From 55a37f987c55888c2dd6ae9e3cf020b032d3e2de Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:20:37 -0800 Subject: [PATCH 001/877] add conf file for sphinx documentation --- docs/source/conf.py | 68 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 docs/source/conf.py diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..5926551 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,68 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) + + +# -- Project information ----------------------------------------------------- + +project = "MaveCore" +copyright = "2022, Alan F Rubin" +author = "Alan F Rubin" + +# The full version, including alpha/beta/rc tags +release = "0.0.1" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx.ext.autosectionlabel", +] +nbsphinx_allow_errors = True + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# Intersphinx information for documentation from other packages +intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pyramid" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["static"] + + +def setup(app): + app.add_css_file("styles.css") From 31b62a17a8d0d0b33b76796d00c7f7c9f36052e6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:21:39 -0800 Subject: [PATCH 002/877] add index.rst file for sphinx documentation --- docs/source/index.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 docs/source/index.rst diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..e9311c6 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,31 @@ +.. MaveCore documentation master file, created by + sphinx-quickstart on Tue Mar 1 14:20:15 2022. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +MaveCore +========= +MaveTools is a pure Python Module for bioinformatics and computational biology. +It features all the shared functionality of MaveDB and MaveTools. 
+ +Install MaveCore using pip:: + + pip3 install MaveCore + +Building a local copy of the documentation requires the following additional packages:: + + pip3 install sphinx + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + validators + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` From a69ad7535d3158dcc1ab90cfc5263b4c2e6adc7b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:21:56 -0800 Subject: [PATCH 003/877] add lato-fonts.css file for sphinx documentation --- docs/source/static/lato-fonts.css | 108 ++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 docs/source/static/lato-fonts.css diff --git a/docs/source/static/lato-fonts.css b/docs/source/static/lato-fonts.css new file mode 100644 index 0000000..f21b9cc --- /dev/null +++ b/docs/source/static/lato-fonts.css @@ -0,0 +1,108 @@ +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI9w2_FQft1dw.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI9w2_Gwft.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u8w4BMUTPHjxsAUi-qJCY.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, 
U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u8w4BMUTPHjxsAXC-q.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI5wq_FQft1dw.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: italic; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u_w4BMUTPHjxsI5wq_Gwft.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh7USSwaPGR_p.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh7USSwiPGQ.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/lato/v20/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6uyw4BMUTPHjx4wXg.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* latin-ext */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Lato'; + font-style: normal; + font-weight: 700; + font-display: swap; + src: url(https://fonts.gstatic.com/s/lato/v20/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} From 640701c3259c42bc6107167b8de674eedc363d9c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:22:06 -0800 Subject: [PATCH 004/877] add make.bat file for sphinx documentation --- docs/make.bat | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 docs/make.bat diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..061f32f --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == 
"" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd From 67ee81624cc8cf4b8b1d93819289b49269c229af Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:22:17 -0800 Subject: [PATCH 005/877] add Makefile file for sphinx documentation --- docs/Makefile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 docs/Makefile diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..92dd33a --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) From 6e54418f94801dd1e3eb1facf6a7c02f92ec7e7d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:22:31 -0800 Subject: [PATCH 006/877] add raleway-fonts.css file for sphinx documentation --- docs/source/static/raleway-fonts.css | 360 +++++++++++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 docs/source/static/raleway-fonts.css diff --git a/docs/source/static/raleway-fonts.css b/docs/source/static/raleway-fonts.css new file mode 100644 index 0000000..1918a50 --- /dev/null +++ b/docs/source/static/raleway-fonts.css @@ -0,0 +1,360 @@ +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, 
U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4ejMDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 300; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4TbMDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4WjMDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, 
U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDr4fIA9c.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDrcfIA9c.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDrwfIA9c.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDr0fIA9c.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: italic; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Pt_g8zYS_SKggPNyCgSQamb1W0lwk4S4bbLDrMfIA.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCMPrEHJA.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 200; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVtaorCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face 
{ + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCMPrEHJA.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 300; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVuEorCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCMPrEHJA.woff2) format('woff2'); + unicode-range: 
U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 400; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVvaorCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} +/* cyrillic-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCFPrEHJA.woff2) format('woff2'); + unicode-range: U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F; +} +/* cyrillic */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCMPrEHJA.woff2) format('woff2'); + unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; +} +/* vietnamese */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: 
url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCHPrEHJA.woff2) format('woff2'); + unicode-range: U+0102-0103, U+0110-0111, U+0128-0129, U+0168-0169, U+01A0-01A1, U+01AF-01B0, U+1EA0-1EF9, U+20AB; +} +/* latin-ext */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCGPrEHJA.woff2) format('woff2'); + unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; +} +/* latin */ +@font-face { + font-family: 'Raleway'; + font-style: normal; + font-weight: 600; + font-display: swap; + src: url(https://fonts.gstatic.com/s/raleway/v22/1Ptxg8zYS_SKggPN4iEgvnHyvveLxVsEpbCIPrE.woff2) format('woff2'); + unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; +} From 315e0b0d6d6733bb0a622c0a541ec4d5a079b9b2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:22:42 -0800 Subject: [PATCH 007/877] add styles.css file for sphinx documentation --- docs/source/static/styles.css | 41 +++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 docs/source/static/styles.css diff --git a/docs/source/static/styles.css b/docs/source/static/styles.css new file mode 100644 index 0000000..4e7affd --- /dev/null +++ b/docs/source/static/styles.css @@ -0,0 +1,41 @@ +@import url("lato-fonts.css"); +@import url("raleway-fonts.css"); + +body { + font-family: "Lato", sans-serif; +} + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + font-family: "Raleway", sans-serif; +} + +div.sphinxsidebar { + font-size: 0.85em; +} + +div.related { + font-size: 0.85em; +} + +div.sphinxsidebar h1, +div.sphinxsidebar h2, +div.sphinxsidebar h3, +div.sphinxsidebar 
h4 { + font-family: "Raleway", sans-serif; +} + +table caption { + margin-bottom: 10px; + font-family: "Raleway", sans-serif; +} + +figcaption span.caption-text, figcaption span.caption-number { + font-family: "Raleway", sans-serif; + font-style: normal; + font-weight: bold; +} From 05ff99f208cb3154bbbc39ad6290979439b9f970 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:22:54 -0800 Subject: [PATCH 008/877] add validators.rst file for sphinx documentation --- docs/source/validators.rst | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/source/validators.rst diff --git a/docs/source/validators.rst b/docs/source/validators.rst new file mode 100644 index 0000000..e58b577 --- /dev/null +++ b/docs/source/validators.rst @@ -0,0 +1,8 @@ +validators +========== + +validators features user-side mave dataset validators functions +that replicate some of the server-side validation done in MaveDB. + +.. automodule:: mavecore.validators.dataset_validators + :members: From ad6a5f34281ff1acbe8120d609cf1c0ad9646757 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:28:24 -0800 Subject: [PATCH 009/877] refactor, rename validator and test_validator directories to validation and test_validation --- mavecore/{validators => validation}/__init__.py | 0 mavecore/{validators => validation}/constants.py | 2 +- .../dataset_validators.py | 2 +- .../{validators => validation}/exceptions.py | 0 .../genome_validators.py | 8 ++++---- .../metadata_validators.py | 4 ++-- .../{validators => validation}/urn_validators.py | 2 +- mavecore/{validators => validation}/validate.py | 4 ++-- .../variant_validators/__init__.py | 0 .../variant_validators/dataset.py | 4 ++-- .../variant_validators/hgvs.py | 6 +++--- .../variant_validators/variant.py | 4 ++-- .../__init__.py | 0 .../test_dataset_validators.py | 4 ++-- .../test_genome_validators.py | 16 ++++++++-------- 
.../test_metadata_validators.py | 4 ++-- .../test_variant_validators/__init__.py | 0 .../test_hgvs_validators.py | 6 +++--- .../test_variant_validators/test_validators.py | 6 +++--- 19 files changed, 36 insertions(+), 36 deletions(-) rename mavecore/{validators => validation}/__init__.py (100%) rename mavecore/{validators => validation}/constants.py (97%) rename mavecore/{validators => validation}/dataset_validators.py (99%) rename mavecore/{validators => validation}/exceptions.py (100%) rename mavecore/{validators => validation}/genome_validators.py (97%) rename mavecore/{validators => validation}/metadata_validators.py (96%) rename mavecore/{validators => validation}/urn_validators.py (98%) rename mavecore/{validators => validation}/validate.py (94%) rename mavecore/{validators => validation}/variant_validators/__init__.py (100%) rename mavecore/{validators => validation}/variant_validators/dataset.py (99%) rename mavecore/{validators => validation}/variant_validators/hgvs.py (94%) rename mavecore/{validators => validation}/variant_validators/variant.py (95%) rename tests/{test_validators => test_validation}/__init__.py (100%) rename tests/{test_validators => test_validation}/test_dataset_validators.py (99%) rename tests/{test_validators => test_validation}/test_genome_validators.py (88%) rename tests/{test_validators => test_validation}/test_metadata_validators.py (97%) rename tests/{test_validators => test_validation}/test_variant_validators/__init__.py (100%) rename tests/{test_validators => test_validation}/test_variant_validators/test_hgvs_validators.py (91%) rename tests/{test_validators => test_validation}/test_variant_validators/test_validators.py (99%) diff --git a/mavecore/validators/__init__.py b/mavecore/validation/__init__.py similarity index 100% rename from mavecore/validators/__init__.py rename to mavecore/validation/__init__.py diff --git a/mavecore/validators/constants.py b/mavecore/validation/constants.py similarity index 97% rename from 
mavecore/validators/constants.py rename to mavecore/validation/constants.py index 9a6b8fd..f10aab8 100644 --- a/mavecore/validators/constants.py +++ b/mavecore/validation/constants.py @@ -37,7 +37,7 @@ """ Constant definitions for application `experiment`. """ -from mavecore.validators.urn_validators import ( +from mavecore.validation.urn_validators import ( MAVEDB_EXPERIMENTSET_URN_PATTERN, MAVEDB_EXPERIMENT_URN_PATTERN, MAVEDB_SCORESET_URN_PATTERN, diff --git a/mavecore/validators/dataset_validators.py b/mavecore/validation/dataset_validators.py similarity index 99% rename from mavecore/validators/dataset_validators.py rename to mavecore/validation/dataset_validators.py index 5071a2f..7a72e0e 100644 --- a/mavecore/validators/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -4,7 +4,7 @@ from numpy.testing import assert_array_equal -from mavecore.validators import constants +from mavecore.validation import constants def is_null(value): diff --git a/mavecore/validators/exceptions.py b/mavecore/validation/exceptions.py similarity index 100% rename from mavecore/validators/exceptions.py rename to mavecore/validation/exceptions.py diff --git a/mavecore/validators/genome_validators.py b/mavecore/validation/genome_validators.py similarity index 97% rename from mavecore/validators/genome_validators.py rename to mavecore/validation/genome_validators.py index 0065d5e..996eb41 100644 --- a/mavecore/validators/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -6,14 +6,14 @@ ReferenceMap GenomicInterval -Most validators should validate one specific field, unless fields need +Most validation should validate one specific field, unless fields need to be validated against each other. 
""" import re from fqfa.validator.validator import dna_bases_validator, amino_acids_validator -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError -from mavecore.validators import constants +from mavecore.validation import constants def is_null(value): @@ -84,7 +84,7 @@ def __str__(self): # blank=False, # null=False, # verbose_name="Reference sequence", - # validators=[validate_wildtype_sequence], + # validation=[validate_wildtype_sequence], # ) # sequence_type = models.CharField( # blank=True, diff --git a/mavecore/validators/metadata_validators.py b/mavecore/validation/metadata_validators.py similarity index 96% rename from mavecore/validators/metadata_validators.py rename to mavecore/validation/metadata_validators.py index d65981c..92bff1b 100644 --- a/mavecore/validators/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -1,8 +1,8 @@ import re import idutils -from mavecore.validators.exceptions import ValidationError -from mavecore.validators.constants import null_values_re +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants import null_values_re def is_null(value): diff --git a/mavecore/validators/urn_validators.py b/mavecore/validation/urn_validators.py similarity index 98% rename from mavecore/validators/urn_validators.py rename to mavecore/validation/urn_validators.py index 823c537..f44f760 100644 --- a/mavecore/validators/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -1,5 +1,5 @@ import re -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 MAVEDB_TMP_URN_DIGITS = 16 diff --git a/mavecore/validators/validate.py b/mavecore/validation/validate.py similarity index 94% rename from mavecore/validators/validate.py rename to mavecore/validation/validate.py index cd822b6..155bf99 100644 --- 
a/mavecore/validators/validate.py +++ b/mavecore/validation/validate.py @@ -1,4 +1,4 @@ -from mavecore.validators import dataset_validators +from mavecore.validation import dataset_validators def validate_all(countfile=None, scorefile=None, scorejson=None): @@ -11,7 +11,7 @@ def validate_all(countfile=None, scorefile=None, scorejson=None): def validate_dataset(countfile=None, scorefile=None, scorejson=None): """ This function calls all of the validation functions within - mavetools/mavetools/validators/dataset_validation.py + mavetools/mavetools/validation/dataset_validation.py Returns ------- diff --git a/mavecore/validators/variant_validators/__init__.py b/mavecore/validation/variant_validators/__init__.py similarity index 100% rename from mavecore/validators/variant_validators/__init__.py rename to mavecore/validation/variant_validators/__init__.py diff --git a/mavecore/validators/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py similarity index 99% rename from mavecore/validators/variant_validators/dataset.py rename to mavecore/validation/variant_validators/dataset.py index 0764dcc..3a01f92 100644 --- a/mavecore/validators/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -11,7 +11,7 @@ from fqfa.util.translate import translate_dna from fqfa.util.infer import infer_sequence_type -from mavecore.validators.constants import ( +from mavecore.validation.constants import ( hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, @@ -20,7 +20,7 @@ , ) -from mavecore.validators.constants import NA_value, null_values_list, null_values_re, readable_null_values +from mavecore.validation.constants import NA_value, null_values_list, null_values_re, readable_null_values def is_null(value): diff --git a/mavecore/validators/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py similarity index 94% rename from mavecore/validators/variant_validators/hgvs.py rename to 
mavecore/validation/variant_validators/hgvs.py index c5aef83..6e157e9 100644 --- a/mavecore/validators/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -3,11 +3,11 @@ import re from mavehgvs import Variant, MaveHgvsParseError -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError -from mavecore.validators.constants import NA_value, null_values_re +from mavecore.validation.constants import NA_value, null_values_re -from mavecore.validators.constants import ( +from mavecore.validation.constants import ( hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, diff --git a/mavecore/validators/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py similarity index 95% rename from mavecore/validators/variant_validators/variant.py rename to mavecore/validation/variant_validators/variant.py index 155426a..5a2d967 100644 --- a/mavecore/validators/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -1,11 +1,11 @@ from typing import Dict -from mavecore.validators.constants import ( +from mavecore.validation.constants import ( variant_score_data, variant_count_data, required_score_column, ) -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError def validate_columns_match(variant, scoreset) -> None: diff --git a/tests/test_validators/__init__.py b/tests/test_validation/__init__.py similarity index 100% rename from tests/test_validators/__init__.py rename to tests/test_validation/__init__.py diff --git a/tests/test_validators/test_dataset_validators.py b/tests/test_validation/test_dataset_validators.py similarity index 99% rename from tests/test_validators/test_dataset_validators.py rename to tests/test_validation/test_dataset_validators.py index b15e16b..3bca487 100644 --- a/tests/test_validators/test_dataset_validators.py +++ 
b/tests/test_validation/test_dataset_validators.py @@ -4,9 +4,9 @@ import pandas as pd -from mavecore.validators import constants +from mavecore.validation import constants -from mavecore.validators.dataset_validators import ( +from mavecore.validation.dataset_validators import ( validate_scoreset_count_data_input, validate_scoreset_score_data_input, validate_at_least_one_additional_column, diff --git a/tests/test_validators/test_genome_validators.py b/tests/test_validation/test_genome_validators.py similarity index 88% rename from tests/test_validators/test_genome_validators.py rename to tests/test_validation/test_genome_validators.py index b4b06a9..6f36283 100644 --- a/tests/test_validators/test_genome_validators.py +++ b/tests/test_validation/test_genome_validators.py @@ -1,15 +1,15 @@ from unittest import TestCase -from mavecore.validators.genome_validators import WildTypeSequence +from mavecore.validation.genome_validators import WildTypeSequence -# from mavetools.validators.genome_factories import ( +# from mavetools.validation.genome_factories import ( # ReferenceMapFactory, # ReferenceGenomeFactory, # GenomicIntervalFactory, # ) -from mavecore.validators.genome_validators import ( +from mavecore.validation.genome_validators import ( validate_wildtype_sequence, validate_gene_name, validate_genome_short_name, @@ -17,14 +17,14 @@ sequence_is_protein, sequence_is_dna, ) -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError -from mavecore.validators.constants import null_values_list +from mavecore.validation.constants import null_values_list class TestWildTypeSequenceValidators(TestCase): """ - Tests validators associated with :class:`WildTypeSequence`. Tests: + Tests validation associated with :class:`WildTypeSequence`. 
Tests: - validate_wildtype_sequence """ @@ -105,7 +105,7 @@ def test_false_aa_sequence(self): class TestReferenceGenomeValidators(TestCase): """ - Tests validators associated with :class:`ReferenceGenome`: + Tests validation associated with :class:`ReferenceGenome`: - validate_reference_genome_has_one_external_identifier - validate_organism_name @@ -125,7 +125,7 @@ def test_ve_null_genome_short_name(self): class TestTargetGeneValidators(TestCase): """ - Tests validators asscociated with :class:`TargetGene`: + Tests validation asscociated with :class:`TargetGene`: - validate_gene_name - validate_target_has_one_primary_reference_map diff --git a/tests/test_validators/test_metadata_validators.py b/tests/test_validation/test_metadata_validators.py similarity index 97% rename from tests/test_validators/test_metadata_validators.py rename to tests/test_validation/test_metadata_validators.py index ef32868..88c1f5c 100644 --- a/tests/test_validators/test_metadata_validators.py +++ b/tests/test_validation/test_metadata_validators.py @@ -1,6 +1,6 @@ from unittest import TestCase -from mavecore.validators.metadata_validators import ( +from mavecore.validation.metadata_validators import ( validate_doi_identifier, validate_doi_list, validate_keyword, @@ -16,7 +16,7 @@ validate_ensembl_identifier, validate_ensembl_list, ) -from mavecore.validators.exceptions import ValidationError +from mavecore.validation.exceptions import ValidationError class TestDOIValidators(TestCase): diff --git a/tests/test_validators/test_variant_validators/__init__.py b/tests/test_validation/test_variant_validators/__init__.py similarity index 100% rename from tests/test_validators/test_variant_validators/__init__.py rename to tests/test_validation/test_variant_validators/__init__.py diff --git a/tests/test_validators/test_variant_validators/test_hgvs_validators.py b/tests/test_validation/test_variant_validators/test_hgvs_validators.py similarity index 91% rename from 
tests/test_validators/test_variant_validators/test_hgvs_validators.py rename to tests/test_validation/test_variant_validators/test_hgvs_validators.py index 9a72ffa..944f435 100644 --- a/tests/test_validators/test_variant_validators/test_hgvs_validators.py +++ b/tests/test_validation/test_variant_validators/test_hgvs_validators.py @@ -1,9 +1,9 @@ # from core.utilities import null_values_list from unittest import TestCase -from mavecore.validators.variant_validators import hgvs -from mavecore.validators.exceptions import ValidationError -from mavecore.validators.constants import null_values_list +from mavecore.validation.variant_validators import hgvs +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants import null_values_list class TestValidateHgvsString(TestCase): diff --git a/tests/test_validators/test_variant_validators/test_validators.py b/tests/test_validation/test_variant_validators/test_validators.py similarity index 99% rename from tests/test_validators/test_variant_validators/test_validators.py rename to tests/test_validation/test_variant_validators/test_validators.py index 404b558..bd144b9 100644 --- a/tests/test_validators/test_variant_validators/test_validators.py +++ b/tests/test_validation/test_variant_validators/test_validators.py @@ -7,11 +7,11 @@ from pandas.testing import assert_index_equal # from dataset import constants -from mavecore.validators import constants -from mavecore.validators.exceptions import ValidationError +from mavecore.validation import constants +from mavecore.validation.exceptions import ValidationError # from ..factories import generate_hgvs, VariantFactory -from mavecore.validators.variant_validators import ( +from mavecore.validation.variant_validators import ( MaveDataset, validate_variant_json, validate_hgvs_string, From 5bfa7f1f12ffb312dd2b78dffeed6d7a6526c84b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 15:27:50 -0800 
Subject: [PATCH 010/877] edit automodule to validation --- docs/source/validators.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/validators.rst b/docs/source/validators.rst index e58b577..4b00f18 100644 --- a/docs/source/validators.rst +++ b/docs/source/validators.rst @@ -4,5 +4,5 @@ validators validators features user-side mave dataset validators functions that replicate some of the server-side validation done in MaveDB. -.. automodule:: mavecore.validators.dataset_validators +.. automodule:: mavecore.validation.dataset_validators :members: From ce2c72861be66bd35d54ffd77c58d17f6af3bf7f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 15:28:28 -0800 Subject: [PATCH 011/877] refactor, rename rst file to validation from validators --- docs/source/{validators.rst => validation.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/{validators.rst => validation.rst} (100%) diff --git a/docs/source/validators.rst b/docs/source/validation.rst similarity index 100% rename from docs/source/validators.rst rename to docs/source/validation.rst From d56676d2af45517fac762bfc3be8f9dac8ca5103 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 1 Mar 2022 15:28:47 -0800 Subject: [PATCH 012/877] rename title in rst file --- docs/source/validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 4b00f18..864dde5 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -1,4 +1,4 @@ -validators +validation ========== validators features user-side mave dataset validators functions From 574f370e3c3320fa6566cf2597c70d9b1c483f64 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:07:32 -0800 Subject: [PATCH 013/877] combine import statements --- 
mavecore/validation/variant_validators/dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 3a01f92..0d621a9 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -15,9 +15,10 @@ hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, - required_score_column - # constants - , + required_score_column, + null_values_list, + null_values_re, + readable_null_values ) from mavecore.validation.constants import NA_value, null_values_list, null_values_re, readable_null_values From 13ca91f77a46fc08b4e1d0c8e4bce51472e5ac8c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:07:45 -0800 Subject: [PATCH 014/877] remove import statement --- mavecore/validation/variant_validators/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 0d621a9..65122c9 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -21,8 +21,6 @@ readable_null_values ) -from mavecore.validation.constants import NA_value, null_values_list, null_values_re, readable_null_values - def is_null(value): """Returns True if a stripped/lowercase value in in `nan_col_values`.""" From 245625aaec504907b3864e0b485165c44b85d129 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:08:10 -0800 Subject: [PATCH 015/877] re-format is_null docstring --- mavecore/validation/variant_validators/dataset.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 65122c9..f83d8b4 100644 --- 
a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -23,7 +23,17 @@ def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. + + Parameters + __________ + value : + + Returns + _______ + bool + """ value = str(value).strip().lower() return null_values_re.fullmatch(value) or not value From ccade56a728fe054cd012a17ced43fdc42142685 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:08:39 -0800 Subject: [PATCH 016/877] Add space for MaveDataset class docstring --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index f83d8b4..508f598 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -39,6 +39,9 @@ def is_null(value): class MaveDataset: + """ + + """ class DatasetType: SCORES = "scores" COUNTS = "counts" From 8b8a14c9cf45a56aaccc965a09507e367d58ccd6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:08:55 -0800 Subject: [PATCH 017/877] Add space for DatasetType class docstring --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 508f598..c0b1d2c 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -43,6 +43,9 @@ class MaveDataset: """ class DatasetType: + """ + + """ SCORES = "scores" COUNTS = "counts" From 8927ec96c515ada918a6281fed879829c642ef52 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: 
Wed, 2 Mar 2022 10:09:17 -0800 Subject: [PATCH 018/877] Add space for HGVSColumns class docstring --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index c0b1d2c..fd70524 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -50,6 +50,9 @@ class DatasetType: COUNTS = "counts" class HGVSColumns: + """ + + """ NUCLEOTIDE: str = hgvs_nt_column TRANSCRIPT: str = hgvs_splice_column PROTEIN: str = hgvs_pro_column From 68038267ca4affe583fae695c2bba3bf4f2decf6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:09:56 -0800 Subject: [PATCH 019/877] Format space for options function docstring --- mavecore/validation/variant_validators/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index fd70524..4cf2aa3 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -59,6 +59,11 @@ class HGVSColumns: @classmethod def options(cls) -> List[str]: + """ + + Returns + _______ + """ return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] class AdditionalColumns: From 37a11f9fdbae34dd1b928dd158cebc7502711c04 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:10:15 -0800 Subject: [PATCH 020/877] Add space for AdditionalColumns class docstring --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 4cf2aa3..8f282fd 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ 
b/mavecore/validation/variant_validators/dataset.py @@ -67,6 +67,9 @@ def options(cls) -> List[str]: return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] class AdditionalColumns: + """ + + """ @classmethod def options(cls) -> List[str]: return [] From 3743b2f3acdd9254a13572655e0855a5d27c9f52 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:11:11 -0800 Subject: [PATCH 021/877] edit parameter type in docstring for validate_variant_json function --- mavecore/validation/variant_validators/variant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py index 5a2d967..9c6dbf2 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -35,7 +35,7 @@ def validate_variant_json(data: Dict[str, Dict]) -> None: Parameters ---------- - data : dict + data : dict[str, dict] Dictionary of keys mapping to a list. """ expected_keys = [variant_score_data, variant_count_data] From f4bee9bb8bba62a3df206a1ed5254ef085c3a75e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:11:44 -0800 Subject: [PATCH 022/877] add raises section in docstring for validate_variant_json function --- mavecore/validation/variant_validators/variant.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py index 9c6dbf2..5294f63 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -37,6 +37,17 @@ def validate_variant_json(data: Dict[str, Dict]) -> None: ---------- data : dict[str, dict] Dictionary of keys mapping to a list. + + Raises + ______ + ValidationError + If missing the required key. 
+ ValidationError + If missing the required column in variant's score data. + ValidationError + If encountered unexpected keys. + ValidationError + If value for key is not of type dict. """ expected_keys = [variant_score_data, variant_count_data] for key in expected_keys: From 9da5333cc16de6011a74011373f3f3fd0abdb311 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:22:29 -0800 Subject: [PATCH 023/877] add return type to options function --- mavecore/validation/variant_validators/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 8f282fd..6fa2343 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -63,6 +63,7 @@ def options(cls) -> List[str]: Returns _______ + List[str] """ return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] From c07802df32f9c73e3d80e5b4edfb690c57666263 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:23:02 -0800 Subject: [PATCH 024/877] start docsting and add return type to options function in AdditionalColumns class --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 6fa2343..a17503e 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -73,6 +73,12 @@ class AdditionalColumns: """ @classmethod def options(cls) -> List[str]: + """ + + Returns + _______ + List[str] + """ return [] # ---------------------- Construction------------------------------------ # From c8eab9f461d909c904f04e78aa6f8fc7e1c2dd8a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:23:45 -0800 
Subject: [PATCH 025/877] start docstring and add parameters and returns to for_scores function --- mavecore/validation/variant_validators/dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index a17503e..6f3ae29 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -84,6 +84,17 @@ def options(cls) -> List[str]: # ---------------------- Construction------------------------------------ # @classmethod def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": + """ + + Parameters + __________ + file : Union[str, TextIO, BinaryIO] + + Returns + _______ + `MaveScoresDataset` + + """ return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) @classmethod From 0773e11005e24efdd9395e32f8945becd627ee40 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:23:55 -0800 Subject: [PATCH 026/877] start docstring and add parameters and returns to for_counts function --- mavecore/validation/variant_validators/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 6f3ae29..3270cad 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -99,6 +99,16 @@ def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": @classmethod def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": + """ + + Parameters + __________ + file : Union[str, TextIO, BinaryIO] + + Returns + _______ + `MaveCountsDataset` + """ return cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) @classmethod From e0f77c12c9cd926a42257676210d1c8b5e8e1548 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:24:24 -0800 Subject: [PATCH 027/877] create docstring and add parameters to _for_type function --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 3270cad..d717c6f 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -115,6 +115,12 @@ def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": def _for_type( cls, file: Union[str, TextIO, BinaryIO], dataset_type: str ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: + """ + + Parameters + __________ + file : Union[str, TextIO, BinaryIO] + dataset_type : str if isinstance(file, str): handle = file From 1d6d1afaa9295ba43a5871665805b09151e26a94 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:24:55 -0800 Subject: [PATCH 028/877] add return types and error types to _for_type docstring --- mavecore/validation/variant_validators/dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index d717c6f..ea3a0b7 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -122,6 +122,17 @@ def _for_type( file : Union[str, TextIO, BinaryIO] dataset_type : str + Returns + _______ + Union[`MaveScoreDataset`, `MaveCountsDataset`] + + Raises + ______ + TypeError + If file parameter is not expected file path or buffer object. + ValueError + If dataset_type parameter is not a recognized dataset type. 
+ """ if isinstance(file, str): handle = file elif hasattr(file, "read"): From d453b9de2101bfd247e488af4c41cf2b5a0d4d3e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:25:21 -0800 Subject: [PATCH 029/877] start docstring for label function and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index ea3a0b7..84b676d 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -177,6 +177,12 @@ def _for_type( # ---------------------- Public ----------------------------------------- # @property def label(self) -> str: + """ + + Returns + _______ + str + """ return "dataset" @property From 1a026d2e5bae73e39f38dcdc08f7c17f6ecaca8d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:25:33 -0800 Subject: [PATCH 030/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 84b676d..8f2ebf5 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -187,6 +187,12 @@ def label(self) -> str: @property def is_valid(self) -> Optional[bool]: + """ + + Returns + _______ + Optional[bool] + """ if self._errors is None: return None return len(self._errors) == 0 From 36e84ec5023172c92b2957d20a71ad6cdfca4602 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:25:38 -0800 Subject: [PATCH 031/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 
insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 8f2ebf5..fc5684a 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -199,6 +199,12 @@ def is_valid(self) -> Optional[bool]: @property def n_errors(self) -> Optional[int]: + """ + + Returns + _______ + Optional[int] + """ if self._errors is None: return None return len(self._errors) From b38aec41eb8f420e2c2f3c345590461fe5783da8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:25:42 -0800 Subject: [PATCH 032/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index fc5684a..ed94c51 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -211,6 +211,12 @@ def n_errors(self) -> Optional[int]: @property def errors(self) -> Optional[List[str]]: + """ + + Returns + _______ + Optional[List[str]] + """ return self._errors @property From 79124f59ab4262f87dc292c6dc4dd27cc9a9b553 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:25:53 -0800 Subject: [PATCH 033/877] start docstring for is_empty and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index ed94c51..6ca273c 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -221,6 +221,12 @@ def errors(self) -> Optional[List[str]]: @property def is_empty(self) -> bool: + """ + + Returns + _______ + bool + """ 
return self._df.empty @property From d3f20ced326b1fad9776fefab9f122748292cfd8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:06 -0800 Subject: [PATCH 034/877] start docstring for columns function and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 6ca273c..3abe233 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -231,6 +231,12 @@ def is_empty(self) -> bool: @property def columns(self) -> List[str]: + """ + + Returns + _______ + List[str] + """ return list(self._df.columns) @property From 4c83e2551b3e0d34b66012146fcce260b4b46fd7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:14 -0800 Subject: [PATCH 035/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 3abe233..70e7fb2 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -241,6 +241,12 @@ def columns(self) -> List[str]: @property def hgvs_columns(self) -> List[str]: + """ + + Returns + _______ + List[str] + """ return [c for c in self.columns if c in self.HGVSColumns.options()] @property From cc364fcfec184ea4fd4fe33e1605038a7d14df67 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:17 -0800 Subject: [PATCH 036/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 70e7fb2..e8e12bc 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -251,6 +251,12 @@ def hgvs_columns(self) -> List[str]: @property def non_hgvs_columns(self) -> List[str]: + """ + + Returns + _______ + List[str] + """ return [c for c in self.columns if c not in self.HGVSColumns.options()] @property From f89ce16789cdbecb90edc5ce8ac751ab3f8b9bba Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:22 -0800 Subject: [PATCH 037/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index e8e12bc..5403e88 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -261,6 +261,12 @@ def non_hgvs_columns(self) -> List[str]: @property def n_rows(self) -> int: + """ + + Returns + _______ + int + """ return len(self._df) @property From 6689e55e074f964fc6b4ae96705dffa2a9a6a80a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:25 -0800 Subject: [PATCH 038/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 5403e88..7543695 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -271,6 +271,12 @@ def n_rows(self) -> int: @property def n_columns(self) -> int: + """ + + Returns + _______ + int + """ return len(self.columns) @property From 
be5351ba47e75a151b9527114699bc3a58816c18 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:29 -0800 Subject: [PATCH 039/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 7543695..0e6f371 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -281,6 +281,12 @@ def n_columns(self) -> int: @property def index_column(self) -> Optional[str]: + """ + + Returns + _______ + Optional[str] + """ if self._errors: return None return self._index_column From a8f19a4427f3d5f3c88759fba1f603fb6be8c4f8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:34 -0800 Subject: [PATCH 040/877] start docstring and add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 0e6f371..0b8c1c9 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -293,6 +293,12 @@ def index_column(self) -> Optional[str]: @property def index(self) -> Optional[pd.Index]: + """ + + Returns + _______ + Optional[`pd.Index`] + """ if self._errors: return None return self._df.index.copy(deep=True) From 5532b37c10bf09c4f6b1b7c9cd4ab55f235021cc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:26:52 -0800 Subject: [PATCH 041/877] add return type to data function --- mavecore/validation/variant_validators/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py 
b/mavecore/validation/variant_validators/dataset.py index 0b8c1c9..4a17971 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -311,6 +311,11 @@ def data(self, serializable=False) -> pd.DataFrame: ---------- serializable: bool Replaces `np.NaN` with `None` for JSON compatibility. + + Returns + _______ + `pd.DataFrame` + """ if serializable: # need to force "object" type to allow None values From c63b865e8a860f92d5d8bfdee8b7e0e87154f0b8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:27:04 -0800 Subject: [PATCH 042/877] add return type to match_other function --- mavecore/validation/variant_validators/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 4a17971..bcc8ce0 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -335,8 +335,9 @@ def match_other(self, other: "MaveDataset") -> Optional[bool]: Returns ------- - A boolean indicating index match, otherwise `None` if either instance - is not valid. + Optional[bool] + A boolean indicating index match, otherwise `None` if either instance + is not valid. 
""" if (not self.is_valid) or (not other.is_valid): return None From 617b952442e0ef272127d94aaab2de73587402f9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:27:16 -0800 Subject: [PATCH 043/877] add return type to to_dict function --- mavecore/validation/variant_validators/dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index bcc8ce0..692f1d0 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -355,6 +355,10 @@ def to_dict(self) -> Dict[str, Dict]: Returns underlying dataframe as dictionary in 'records' orientation. Keys will be index values and values will be an inner dictionary mapping column names to row values for said index. + + Returns + _______ + Dict[str, Dict] """ # Convert np.NaN values to None for consistency across all columns and # for compatibility in PostgresSQL queries. 
Replaces all values which From 97e3c2250917da0f89ad4c9a3361b882818fd9bc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:28:10 -0800 Subject: [PATCH 044/877] start docstring for validate function and add parameters, and return types --- .../validation/variant_validators/dataset.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 692f1d0..8b53373 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -372,6 +372,22 @@ def validate( relaxed_ordering: bool = False, allow_index_duplicates: bool = False, ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + allow_index_duplicates : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ self._errors = [] self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) From dcaa5d507b025d23c62693f38643215ae15459c2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:29:22 -0800 Subject: [PATCH 045/877] start constructor docstring for MaveDataset class and add parameters --- mavecore/validation/variant_validators/dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 8b53373..d0d2efa 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -426,6 +426,17 @@ def __init__( index_column: Optional[str] = None, errors: Optional[List[str]] = None, ): + """ + + Parameters + df : + index_column : + errors : + + Raises + ______ + + """ self._df: pd.DataFrame = pd.DataFrame() if df is None else df self._index_column = index_column or None self._errors = None if errors is None else list(errors) 
From 8384984e66e8f6137bf22b073889d899ffd43831 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:40:03 -0800 Subject: [PATCH 046/877] start docstring --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index d0d2efa..7e470f4 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -442,6 +442,12 @@ def __init__( self._errors = None if errors is None else list(errors) def __repr__(self): + """ + + Returns + _______ + + """ return ( f"<" f"{self.__class__.__name__} " From 800bbeb61d6ab0e720ac54e13df7dbcca1ff7cb8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:40:20 -0800 Subject: [PATCH 047/877] start docstring add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 7e470f4..d3e7822 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -459,6 +459,12 @@ def __repr__(self): @property def _column_order(self) -> Dict[str, int]: + """ + + Returns + _______ + Dict[str, int] + """ return defaultdict( lambda: 100, { From c3c82b41462874cf9d4104a0d5f7ba25a22d5567 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:40:25 -0800 Subject: [PATCH 048/877] start docstring add return type --- mavecore/validation/variant_validators/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index d3e7822..fef8d06 100644 --- 
a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -479,6 +479,16 @@ def _column_order(self) -> Dict[str, int]: ) def _validate_columns(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ if self._errors: return self From 94ebe1cadf2ebed7b2d2f782e6f9b468881a0ffb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:40:30 -0800 Subject: [PATCH 049/877] start docstring add return type --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index fef8d06..6137bf2 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -530,6 +530,12 @@ def _validate_columns(self) -> "MaveDataset": return self def _normalize_data(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + """ if self._errors: return self From aefe6b25c9f3cdb7f599372904c0507518666c44 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:00 -0800 Subject: [PATCH 050/877] start docstring add parameters and return type to _validate_genomic_variants --- mavecore/validation/variant_validators/dataset.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 6137bf2..1e1a3f2 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -553,6 +553,21 @@ def _normalize_data(self) -> "MaveDataset": def _validate_genomic_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + + 
Returns + _______ + `MaveDataset` + + Raises + ______ + + """ if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): return self From 64d3005e9d4412b58149fcc1845d1ceef708010c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:17 -0800 Subject: [PATCH 051/877] start docstring add parameters and return type to _validate_transcript_variants --- mavecore/validation/variant_validators/dataset.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 1e1a3f2..3b7eeeb 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -606,6 +606,21 @@ def _validate_genomic_variants( def _validate_transcript_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) From a095c90e15882bddba0a1db3404fad85c0519c25 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:28 -0800 Subject: [PATCH 052/877] start docstring add parameters and return type to _validate_protein_variants --- mavecore/validation/variant_validators/dataset.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 3b7eeeb..f318aa3 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -652,6 +652,21 @@ def _validate_transcript_variants( def _validate_protein_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = 
False ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ if self._column_is_null(self.HGVSColumns.PROTEIN): return self From fa412666b6c339edde4921b1dc98573a0b6941c4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:44 -0800 Subject: [PATCH 053/877] start docstring add parameters and return type to _validate_index_column --- mavecore/validation/variant_validators/dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index f318aa3..4f6bd46 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -703,6 +703,20 @@ def _validate_protein_variants( return self def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": + """ + + Parameters + __________ + allow_duplicates : bool + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ if self._errors: return self From 9115b9ac26d7fc75cf94e8b5f35b4a5afdac80d8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:58 -0800 Subject: [PATCH 054/877] start docstring add parameters and return type to _validate_variants --- .../validation/variant_validators/dataset.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 4f6bd46..94343cd 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -754,6 +754,23 @@ def _validate_variants( targetseq: Optional[str] = None, relaxed_ordering: bool = False, ) -> Tuple[pd.Series, Set[str], List[str]]: + """ + + Parameters + __________ + column : str + splice_defined : 
Optional[bool] + targetseq : Optional[str] + relaxed_ordering : bool + + Returns + _______ + Tuple[`pd.Series`, Set[str], List[str]] + + Raises + ______ + + """ prefixes = set() errors = [] From 51ab2e4c52c0c75ee247bf1f04753e0ef59bd216 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:42:11 -0800 Subject: [PATCH 055/877] start docstring add parameters and return type to _column is null --- mavecore/validation/variant_validators/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 94343cd..b3477d0 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -823,6 +823,16 @@ def validate_variant(variant: str): return validated_variants, prefixes, errors def _column_is_null(self, column) -> bool: + """ + + Parameters + __________ + column : + + Returns + _______ + bool + """ return len(self._df[self._df[column].isna()]) == len(self._df) def _column_is_partially_null(self, column) -> bool: From 4386bd0a1ccd7f6333a0e1fee53232922e506383 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:42:35 -0800 Subject: [PATCH 056/877] start docstring add parameters and return type to _column_is_partially_null --- mavecore/validation/variant_validators/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index b3477d0..1dff2f3 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -836,6 +836,16 @@ def _column_is_null(self, column) -> bool: return len(self._df[self._df[column].isna()]) == len(self._df) def _column_is_partially_null(self, column) -> bool: + """ + + Parameters + __________ + column : 
+ + Returns + _______ + bool + """ return 0 < len(self._df[self._df[column].isna()]) < len(self._df) def _column_is_fully_specified(self, column) -> bool: From b8c7ad8360c18dfbab3c6842210854fb9f6d94ff Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:42:49 -0800 Subject: [PATCH 057/877] start docstring add parameters and return type to _column_is_fully_specified --- mavecore/validation/variant_validators/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 1dff2f3..382bf8e 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -849,6 +849,16 @@ def _column_is_partially_null(self, column) -> bool: return 0 < len(self._df[self._df[column].isna()]) < len(self._df) def _column_is_fully_specified(self, column) -> bool: + """ + + Parameters + __________ + column : + + Returns + _______ + bool + """ return len(self._df[self._df[column].isna()]) == 0 def _validate_variant_prefix_for_column( From b35d7bc5b7b0b534707e4434668cc0da795e8cf0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:43:26 -0800 Subject: [PATCH 058/877] start docstring and add parameters, returns and errors to function _column_is_fully_specified --- .../validation/variant_validators/dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 382bf8e..1307e96 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -864,6 +864,24 @@ def _column_is_fully_specified(self, column) -> bool: def _validate_variant_prefix_for_column( self, variant: Variant, prefix: str, column: str, splice_defined: bool ) -> 
Optional[str]: + """ + + Parameters + __________ + variant : Variant + prefix : str + column : str + splice_defined : bool + + Returns + _______ + Optional[str] + + Raises + ______ + ValueError + If there is an unknown column as column argument. + """ prefix = prefix.lower() if column == self.HGVSColumns.NUCLEOTIDE: From 63accb2944295410176b788dc15d135d5847bfdd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:43:43 -0800 Subject: [PATCH 059/877] start docstring for MaveScoresDataset --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 1307e96..a10ea80 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -927,6 +927,9 @@ def _validate_variant_prefix_for_column( class MaveScoresDataset(MaveDataset): + """ + + """ class AdditionalColumns: SCORES = required_score_column From dd3b1d6f607ebecf539de7e6ae9bcb7a4ea62aea Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:44:01 -0800 Subject: [PATCH 060/877] start docstring for AdditionalColumns class --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index a10ea80..8dc1eae 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -931,6 +931,9 @@ class MaveScoresDataset(MaveDataset): """ class AdditionalColumns: + """ + + """ SCORES = required_score_column @classmethod From 1367bfe1c9fccb59e277bfb464be987baf41fbae Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:44:18 -0800 Subject: [PATCH 061/877] start 
docstring and add return type to options function --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 8dc1eae..68595ed 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -938,6 +938,12 @@ class AdditionalColumns: @classmethod def options(cls) -> List[str]: + """ + + Returns + _______ + List[str] + """ return [cls.SCORES] @property From 0d5c607eebe95e5aa1aaa44f54422008698c1955 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:44:25 -0800 Subject: [PATCH 062/877] start docstring and add return type to label function --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 68595ed..045b66a 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -948,6 +948,12 @@ def options(cls) -> List[str]: @property def label(self) -> str: + """ + + Returns + _______ + str + """ return "scores" def _validate_columns(self) -> "MaveDataset": From 6d523266d332a4d818d17e169959326c6aaa0fff Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:44:44 -0800 Subject: [PATCH 063/877] start docstring and add return type to _validate_columns function --- mavecore/validation/variant_validators/dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 045b66a..51c304f 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -957,6 +957,16 @@ def 
label(self) -> str: return "scores" def _validate_columns(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ super()._validate_columns() if self.AdditionalColumns.SCORES not in self.columns: From 8ab6e9495c96ca7320ab1eaccdd3339ff0268ebe Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:45:14 -0800 Subject: [PATCH 064/877] start docstring and add return type and errors to _normalize_data function --- mavecore/validation/variant_validators/dataset.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 51c304f..8459698 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -979,6 +979,17 @@ def _validate_columns(self) -> "MaveDataset": return self def _normalize_data(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + + Raises + ______ + ValueError + + """ super()._normalize_data() should_be_numeric = [self.AdditionalColumns.SCORES] From 0705b909613e97056feeab53a9958825b8cd7632 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:45:29 -0800 Subject: [PATCH 065/877] start docstring for MaveCountsDataset class --- mavecore/validation/variant_validators/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 8459698..cd2c9eb 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -1004,6 +1004,9 @@ def _normalize_data(self) -> "MaveDataset": class MaveCountsDataset(MaveDataset): + """ + + """ @property def label(self) -> str: return "counts" From 23fa88ab125ef838a1c0b075005a7ba2c8c1bdc6 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:45:48 -0800 Subject: [PATCH 066/877] start docstring and add return type to label function --- mavecore/validation/variant_validators/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index cd2c9eb..404ecd2 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -1009,4 +1009,10 @@ class MaveCountsDataset(MaveDataset): """ @property def label(self) -> str: + """ + + Returns + _______ + str + """ return "counts" From 413e666574617ea45ae66ab5288453df4548c30e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:51:41 -0800 Subject: [PATCH 067/877] re-format docstring --- mavecore/validation/dataset_validators.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7a72e0e..2ce0e8f 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -8,7 +8,17 @@ def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. 
+ + Parameters + __________ + value + + Returns + _______ + + """ value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value From 589f9b5c47df90e6616f172783def40d4f04226c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:51:59 -0800 Subject: [PATCH 068/877] add docstring for WordLimitValidator --- mavecore/validation/dataset_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 2ce0e8f..48411f6 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -24,6 +24,9 @@ def is_null(value): class WordLimitValidator: + """ + + """ message = "This field is limited to {} words." code = "invalid" counter = re.compile(r"\w+\b", flags=re.IGNORECASE) From 69055f4ac01cae6a2e0cde9f09630b9957dc86d2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:52:13 -0800 Subject: [PATCH 069/877] add docstring for WordLimitValidator constructor, include parameters --- mavecore/validation/dataset_validators.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 48411f6..ff19d01 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -32,6 +32,14 @@ class WordLimitValidator: counter = re.compile(r"\w+\b", flags=re.IGNORECASE) def __init__(self, word_limit, message=None, code=None): + """ + + Parameters + __________ + word_limit : + message : + code : + """ if message is not None: self.message = message if code is not None: From fd6f2378566d9681ecc2f712a244e9e3b57bd7af Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:52:27 -0800 Subject: [PATCH 070/877] add docstring --- 
mavecore/validation/dataset_validators.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index ff19d01..d308cf6 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -47,6 +47,19 @@ def __init__(self, word_limit, message=None, code=None): self.word_limit = int(word_limit) def __call__(self, value): + """ + Parameters + __________ + value : + + Returns + _______ + + Raises + ______ + ValueError + If + """ if not value: return if len(self.counter.findall(value)) > self.word_limit: From db65864d57effa6d699fbf77e792c26d2dbe719e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:52:57 -0800 Subject: [PATCH 071/877] write docstring for read_header_from_io, include parameters and errors --- mavecore/validation/dataset_validators.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index d308cf6..93252d4 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -67,6 +67,23 @@ def __call__(self, value): def read_header_from_io(file, label=None, msg=None): + """ + Parameters + __________ + file : + label : + default = None + msg : + default = None + + Returns + _______ + + Raises + ______ + ValueError + If + """ if label is None: label = "uploaded" From 920049711711a18ded383fe4275f09a16c21813f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:53:19 -0800 Subject: [PATCH 072/877] write docstring for validate_has_hgvs_in_header, include parameters and errors --- mavecore/validation/dataset_validators.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 
93252d4..9a9c814 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -105,6 +105,20 @@ def read_header_from_io(file, label=None, msg=None): def validate_has_hgvs_in_header(header, label=None, msg=None): + """ + Parameters + __________ + header : + label : + default = None + msg : + default = None + + Raises + ______ + ValueError + If + """ if label is None: label = "Uploaded" params = {} From c609a2d79cdb2a48523ba3b2f29d84fa671c9879 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:53:49 -0800 Subject: [PATCH 073/877] begin docstring for validate_has_at_least_one_additional_column, include parameters and errors --- mavecore/validation/dataset_validators.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 9a9c814..e5e13e8 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -138,6 +138,19 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): def validate_at_least_one_additional_column(header, label=None, msg=None): + """ + Parameters + __________ + header : + label : + default = None + msg : + default = None + + Raises + ______ + ValueError + """ if label is None: label = "Uploaded" params = {} From 1d6f6bc919d0108db5467598411187110c49c2c8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:54:31 -0800 Subject: [PATCH 074/877] begin docstring for validate_header_contains_no_null_columns, include parameters and errors --- mavecore/validation/dataset_validators.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index e5e13e8..6570173 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py 
@@ -170,6 +170,17 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): def validate_header_contains_no_null_columns(header, label=None, msg=None): + """ + Parameters + __________ + header : + label : + msg : + + Raises + ______ + ValueError + """ if label is None: label = "File" any_null = any([is_null(v) for v in header]) From 8398909488bf5ebccdbf4b63bd18729a036da465 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:55:03 -0800 Subject: [PATCH 075/877] add errors to validate_datasets_define_dame_variants docstring --- mavecore/validation/dataset_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 6570173..97a09f4 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -205,6 +205,11 @@ def validate_datasets_define_same_variants(scores, counts): Scores dataframe parsed from an uploaded scores file. counts : `pd.DataFrame` Scores dataframe parsed from an uploaded counts file. + + Raises + ______ + ValueError + If score and counts files do not define the same variants. """ try: assert_array_equal( From 5324a2ccf80ae3ded903e4da87b26de2df79d702 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:55:24 -0800 Subject: [PATCH 076/877] add errors to validate_scoreset_score_data_input docstring --- mavecore/validation/dataset_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 97a09f4..d1fbee2 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -241,6 +241,11 @@ def validate_scoreset_score_data_input(file): ---------- file : :class:`io.FileIO` An open file handle in read mode. 
+ + Raises + ______ + ValueError + If score data file is missing the required column constants.required_score_column """ file.seek(0) header = read_header_from_io(file, label="Score") From 952960bb7d29e0dec324e9a50dd02e1b34141b0c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:55:42 -0800 Subject: [PATCH 077/877] add errors to validate_scoreset_json docstring --- mavecore/validation/dataset_validators.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index d1fbee2..c65f33d 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -289,6 +289,19 @@ def validate_scoreset_json(dict_): ---------- dict_ : dict Dictionary of keys mapping to a list. + + Raises + ______ + ValueError + If scoreset data is missing the required key. + ValueError + If header values are not strings. + ValueError + If + ValueError + If missing required column constants.required_score_column for score dataset. + ValueError + If encountered unexpected keys extras. """ required_columns = [constants.score_columns, constants.count_columns] From 169c0f06011fa4dba79e4252064927fc337a83e2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:56:22 -0800 Subject: [PATCH 078/877] edit automodule for dataset_validators.rst --- docs/source/dataset_validators.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 docs/source/dataset_validators.rst diff --git a/docs/source/dataset_validators.rst b/docs/source/dataset_validators.rst new file mode 100644 index 0000000..11155cd --- /dev/null +++ b/docs/source/dataset_validators.rst @@ -0,0 +1,10 @@ +validation +========== + +validators features user-side mave dataset validators functions +that replicate some of the server-side validation done in MaveDB. + +.. 
automodule:: mavecore.validation.dataset_validators + :members: +.. automodule:: mavecore.validation.genome_validators + :members: From c1524c768836ccb1d08e7e1547747635982ccb44 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:56:34 -0800 Subject: [PATCH 079/877] re-format docstring --- mavecore/validation/genome_validators.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 996eb41..5047648 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -17,7 +17,17 @@ def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. + + Parameters + __________ + value : + + Returns + _______ + + """ value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value From 76951f05b66676e08e9cccc7e11d63eae9791200 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:56:54 -0800 Subject: [PATCH 080/877] begin docstring for sequence Type class --- mavecore/validation/genome_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5047648..d987fce 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -55,6 +55,9 @@ class WildTypeSequence: """ class SequenceType: + """ + + """ DNA = "dna" PROTEIN = "protein" INFER = "infer" From 0083c1943623ec4e0003272d0d534e85bf875743 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:57:19 -0800 Subject: [PATCH 081/877] begin docstring for detect_sequence_type function --- mavecore/validation/genome_validators.py | 14 ++++++++++++++ 
1 file changed, 14 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index d987fce..c469723 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -64,6 +64,20 @@ class SequenceType: @classmethod def detect_sequence_type(cls, sequence): + """ + + Parameters + __________ + sequence : + + Returns + _______ + + Raises + ______ + ValueError + If sequence parameter is not protein or DNA + """ if sequence_is_dna(sequence): return cls.DNA elif sequence_is_protein(sequence): From ea6b05d86155f06c0564e05530f96a1a987658d5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:57:50 -0800 Subject: [PATCH 082/877] re-format docstring --- mavecore/validation/variant_validators/hgvs.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 6e157e9..b31c551 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -16,7 +16,17 @@ # from core.utilities import is_null def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. 
+ + Parameters + __________ + value + + Returns + _______ + + """ value = str(value).strip().lower() return null_values_re.fullmatch(value) or not value From 34be3204f173557fdb90913380eda4f085e816a1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:58:15 -0800 Subject: [PATCH 083/877] add docstring, include parameters, returns, and errors --- .../validation/variant_validators/hgvs.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index b31c551..5f19735 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -38,6 +38,38 @@ def validate_hgvs_string( targetseq: Optional[str] = None, relaxed_ordering: bool = False, ) -> Optional[str]: + """ + + Parameters + __________ + value : + column : + splice_present : + targetseq : + relaxed_ordering : + + Returns + _______ + + Raises + ______ + ValidationError + If variant HGVS input values are not strings. + ValidationError + If value is _sy or _wt, which are no longer supported. + ValidationError + If + ValidationError + If value is not a genomic variant (prefix 'g.'). Nucleotide variants must + be genomic if transcript variants are also defined. + ValidationError + If value is not a transcript variant. The accepted transcript variant + prefixes are 'c.', 'n.'. + ValidationError + If value is not a protein variant. The accepted protein variant prefix is 'p.'. + ValueError + If there exists an unknown column. Function expects nt, splice or p." 
+ """ if is_null(value): return None From 5e284ae107ab14d475343ff461ad4d4de0a67ac9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:37:40 -0800 Subject: [PATCH 084/877] add docstring to is_protein method --- docs/source/index.rst | 2 +- mavecore/validation/genome_validators.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index e9311c6..c3eded2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,7 +20,7 @@ Building a local copy of the documentation requires the following additional pac :maxdepth: 2 :caption: Contents: - validators + validation Indices and tables diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index c469723..ab3b984 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -89,6 +89,16 @@ def detect_sequence_type(cls, sequence): @classmethod def is_protein(cls, value): + """ + + Parameters + __________ + value : + + Returns + _______ + + """ return value == cls.PROTEIN @classmethod From ea6de5738e02625588cbfb3117286614240a6a73 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:37:50 -0800 Subject: [PATCH 085/877] add docstring to is_dna method --- mavecore/validation/genome_validators.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index ab3b984..1ce0da8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -103,6 +103,16 @@ def is_protein(cls, value): @classmethod def is_dna(cls, value): + """ + + Parameters + __________ + value : + + Returns + _______ + + """ return value == cls.DNA @classmethod From 59259c53522a0a9539cb70db40f27d7304d5f126 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:37:59 -0800 Subject: [PATCH 086/877] add docstring to choices method --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 1ce0da8..7394c72 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -117,6 +117,11 @@ def is_dna(cls, value): @classmethod def choices(cls): + """ + + Returns + _______ + """ return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] class Meta: From 07ba976bf7604e533126116d4e128915f977029a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:38:16 -0800 Subject: [PATCH 087/877] start docstring form Meta class --- mavecore/validation/genome_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 7394c72..efa9bd8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -125,6 +125,9 @@ def choices(cls): return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] class Meta: + """ + + """ verbose_name = "Reference sequence" verbose_name_plural = "Reference sequences" From 6253bf0c9fb4c5236623f88eec0c5013532127f2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:38:37 -0800 Subject: [PATCH 088/877] start docstring for __str__ method within Meta class --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index efa9bd8..46c8775 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -132,6 +132,12 @@ class Meta: verbose_name_plural = 
"Reference sequences" def __str__(self): + """ + + Returns + _______ + + """ return self.get_sequence() # sequence = models.TextField( From ef76e124968b16c1adcd6631e10d7ab3ac5f29ba Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:38:57 -0800 Subject: [PATCH 089/877] start docstring for is_dna method within Meta class --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 46c8775..8f9adfc 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -158,6 +158,12 @@ def __str__(self): @property def is_dna(self): + """ + + Returns + _______ + + """ return self.__class__.SequenceType.is_dna(self.sequence_type) @property From 378219dff877b01efc539b5ea44d291c1ecf078f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:39:06 -0800 Subject: [PATCH 090/877] start docstring for is_protein method within Meta class --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 8f9adfc..d1a0d23 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -168,6 +168,12 @@ def is_dna(self): @property def is_protein(self): + """ + + Returns + _______ + + """ return self.__class__.SequenceType.is_protein(self.sequence_type) def save(self, *args, **kwargs): From 0bfb490bf42073639f33d329724a3b7d20471d93 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:39:18 -0800 Subject: [PATCH 091/877] start docstring for save method within Meta class --- mavecore/validation/genome_validators.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index d1a0d23..44ddde8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -177,6 +177,17 @@ def is_protein(self): return self.__class__.SequenceType.is_protein(self.sequence_type) def save(self, *args, **kwargs): + """ + + Parameters + __________ + args : + kwargs : + + Returns + _______ + + """ if self.sequence is not None: self.sequence = self.sequence.upper() self.sequence_type = ( From 3e1b35fbd0609e6040d7098091aefe784179f47f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:39:33 -0800 Subject: [PATCH 092/877] start docstring for get_sequence method --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 44ddde8..dff06dd 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -199,6 +199,12 @@ def save(self, *args, **kwargs): return super().save(*args, **kwargs) def get_sequence(self): + """ + + Returns + _______ + + """ return self.sequence.upper() def is_attached(self): From 8185a46d3006f110d0279781b76c5ffac8d24bb8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:39:40 -0800 Subject: [PATCH 093/877] start docstring for is_attached method --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index dff06dd..88ec873 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -208,6 +208,12 @@ def get_sequence(self): return self.sequence.upper() def is_attached(self): + """ + + Returns + _______ + + """ return getattr(self, "target", None) is not None From 
509d44f796e4687423428f2deb6e5be48958b25f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:40:31 -0800 Subject: [PATCH 094/877] start docstring for validate_interval_start_lteq_end method, include parameters, returns, and errors --- mavecore/validation/genome_validators.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 88ec873..2cf58d6 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -220,6 +220,21 @@ def is_attached(self): # GenomicInterval # ------------------------------------------------------------------------- # def validate_interval_start_lteq_end(start, end): + """ + + Parameters + __________ + start : + end : + + Returns + _______ + + Raises + ______ + ValidationError + If an interval's starting coordinate is greater than the ending coordinate. + """ # Intervals may be underspecified, but will be ignored so skip validation. if start is None or end is None: return From a1b33d78d7e55f744b827bf0a92a94ac51821373 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:40:46 -0800 Subject: [PATCH 095/877] start docstring for validate strand method, include parameters and errors --- mavecore/validation/genome_validators.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 2cf58d6..5b3fcb0 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -248,6 +248,17 @@ def validate_interval_start_lteq_end(start, end): def validate_strand(value): + """ + + Parameters + __________ + value : + + Raises + ______ + ValidationError + If GenomicInterval strand is not positive or negative. 
+ """ if value not in ("+", "-"): raise ValidationError("GenomicInterval strand must be either '+' or '-'") From 947d9e04e6d0352dfa1e6364431c81fff05a71db Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:51:17 -0800 Subject: [PATCH 096/877] add un-formatted docstring to validate_chromosome --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5b3fcb0..8b3026f 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -264,6 +264,11 @@ def validate_strand(value): def validate_chromosome(value): + """ + + :param value: + :return: + """ # Intervals may be underspecified, but will be ignored so skip validation. if value is None: return From be58d152eac85631ceebd9ed1f5b54078df104f0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:51:31 -0800 Subject: [PATCH 097/877] add un-formatted docstring to validate_unique_intervals --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 8b3026f..7146f57 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -277,6 +277,11 @@ def validate_chromosome(value): def validate_unique_intervals(intervals): + """ + + :param intervals: + :return: + """ for interval1 in intervals: for interval2 in intervals: if ( From 5a27e2199140c1947dc6166d3283243a5eb57918 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:51:45 -0800 Subject: [PATCH 098/877] add un-formatted docstring to validate_wildtype_sequence --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 7146f57..c7e7fba 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -299,6 +299,12 @@ def validate_unique_intervals(intervals): # WildTypeSequence # ------------------------------------------------------------------------- # def validate_wildtype_sequence(seq, as_type="any"): + """ + + :param seq: + :param as_type: + :return: + """ # from .models import WildTypeSequence # Explicitly check for these cases as they are also valid AA sequences. From 41184fa3ae34958519d7f68cdb82706d97a9a8c2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:51:55 -0800 Subject: [PATCH 099/877] add un-formatted docstring to sequence_is_dna --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index c7e7fba..ee4125f 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -337,6 +337,11 @@ def validate_wildtype_sequence(seq, as_type="any"): def sequence_is_dna(seq): + """ + + :param seq: + :return: + """ # Explicitly check for these cases as they are also valid AA sequences. 
if is_null(seq): return False From 26576c1eb40b93ca9f2f527afd006f88cecee4f0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:52:07 -0800 Subject: [PATCH 100/877] add un-formatted docstring to sequence is protein --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index ee4125f..5b13953 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -350,6 +350,11 @@ def sequence_is_dna(seq): def sequence_is_protein(seq): + """ + + :param seq: + :return: + """ # Explicitly check for these cases as they are also valid AA sequences. if is_null(seq): return False From bb5a9bc77882016c7e255c73bf4b54140c9a22ce Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:52:21 -0800 Subject: [PATCH 101/877] add un-formatted docstring to validate_organism_name --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5b13953..f71724e 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -367,6 +367,11 @@ def sequence_is_protein(seq): # ReferenceGenome # ------------------------------------------------------------------------- # def validate_organism_name(value): + """ + + :param value: + :return: + """ if is_null(value): raise ValidationError("Species name must not be null.") From 8fdfc91a7eef710b5d3f89b1beb716b25f90561d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:52:53 -0800 Subject: [PATCH 102/877] add un-formatted docstring to validate_reference_genome_has_one_external_identifier --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index f71724e..3a3a03d 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -377,6 +377,11 @@ def validate_organism_name(value): def validate_reference_genome_has_one_external_identifier(referencegenome): + """ + + :param referencegenome: + :return: + """ if not referencegenome.genome_id: raise ValidationError( "Only one external identifier can be specified for a reference" "genome." From 14beed6ab3dd6462be5c3a1130459ca4ad1037f8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:53:07 -0800 Subject: [PATCH 103/877] validate_genome_short_name --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 3a3a03d..078ad9b 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -389,6 +389,11 @@ def validate_reference_genome_has_one_external_identifier(referencegenome): def validate_genome_short_name(value): + """ + + :param value: + :return: + """ if is_null(value): raise ValidationError("Genome short name must not be null.") From f83a13ad6446c3169d9e3d8849479e79252923b7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:53:40 -0800 Subject: [PATCH 104/877] add un-formatted docstring to validate_map_has_unique_reference_genome --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 078ad9b..1240f77 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -401,6 +401,11 @@ def validate_genome_short_name(value): # ReferenceMap # 
------------------------------------------------------------------------- # def validate_map_has_unique_reference_genome(annotations): + """ + + :param annotations: + :return: + """ genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) if len(genomes) < len(annotations): raise ValidationError( From 82bbb35d8da95877697906c6bfdfb5d3cfb231f2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:53:55 -0800 Subject: [PATCH 105/877] add un-formatted docstring to validate_map_has_at_least_one_interval --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 1240f77..2d75c74 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -414,6 +414,11 @@ def validate_map_has_unique_reference_genome(annotations): def validate_map_has_at_least_one_interval(reference_map): + """ + + :param reference_map: + :return: + """ if not reference_map.get_intervals().count(): raise ValidationError( "You must specify at least one interval for each reference map." 
From d734e777cc07d4a7d48d753a7a93c8fdf91dc5f6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:54:07 -0800 Subject: [PATCH 106/877] add un-formatted docstring to validate_at_least_one_map --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 2d75c74..feccd90 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -426,6 +426,11 @@ def validate_map_has_at_least_one_interval(reference_map): def validate_at_least_one_map(reference_maps): + """ + + :param reference_maps: + :return: + """ if not len(reference_maps): raise ValidationError( "A target must have at least one reference map specified." From 6a15427296ae6eae2a40c41df27b1503f3783c1c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:54:18 -0800 Subject: [PATCH 107/877] add un-formatted docstring to validate_one_primary_map --- mavecore/validation/genome_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index feccd90..5ed044a 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -438,6 +438,11 @@ def validate_at_least_one_map(reference_maps): def validate_one_primary_map(reference_maps): + """ + + :param reference_maps: + :return: + """ primary_count = sum(a.is_primary_reference_map() for a in reference_maps) if primary_count > 1 or primary_count < 1: raise ValidationError("A target must have one primary reference map.") From 99479ca0946a053b13b3242afa2eca42aae5da85 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:54:56 -0800 Subject: [PATCH 108/877] add docstring to validate_gene_map, including description, 
parameters, and errors --- mavecore/validation/genome_validators.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5ed044a..3a290aa 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -451,5 +451,17 @@ def validate_one_primary_map(reference_maps): # TargetGene # ------------------------------------------------------------------------- # def validate_gene_name(value): + """ + This function checks to see if a gene name is null and raises and error if it is. + + Parameters + __________ + value : + The gene name. + Raises + ______ + ValidationError + If gene name (value parameter) is null. + """ if is_null(value): raise ValidationError("Gene name must not be null.") From 885524df65025a025870b826ca5586b676a6b45c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:55:21 -0800 Subject: [PATCH 109/877] reformat docstring for validate_gene_map --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 3a290aa..301f92c 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -458,6 +458,7 @@ def validate_gene_name(value): __________ value : The gene name. 
+ Raises ______ ValidationError From e0fefe50cae09606405a02c89ef36f15ec97640d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:56:03 -0800 Subject: [PATCH 110/877] reformat docstring for validate_columns_match, add parameters, and errors --- mavecore/validation/variant_validators/variant.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py index 5294f63..68e3849 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -12,6 +12,18 @@ def validate_columns_match(variant, scoreset) -> None: """ Validate that a child matches parents defined columns to keep data in sync. + + Parameters + __________ + variant : + scoreset : + + Raises + ______ + ValidationError + If variant score columns do not match scoreset score columns. + ValidationError + If variant count columns do not match scoreset count columns. """ try: if variant.score_columns != scoreset.score_columns: From e38afea0345ac562dcc12c179cc74270255be024 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 14:00:45 -0800 Subject: [PATCH 111/877] add description to validate_one_primary_map docstring --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 301f92c..a612a6d 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -439,6 +439,8 @@ def validate_at_least_one_map(reference_maps): def validate_one_primary_map(reference_maps): """ + This function validates the existence of one primary reference map and raises an error + if it does not exist. 
:param reference_maps: :return: From 05fc224b45314240dd1ebda2dc34d07f130d516a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 2 Mar 2022 14:01:21 -0800 Subject: [PATCH 112/877] reformat parameters and add errors to validate_one_primary_map docstring --- mavecore/validation/genome_validators.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index a612a6d..5e6fe88 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -442,8 +442,14 @@ def validate_one_primary_map(reference_maps): This function validates the existence of one primary reference map and raises an error if it does not exist. - :param reference_maps: - :return: + Parameters + __________ + reference_maps : + + Raises + ______ + ValidationError + If target has less than or more than one primary reference map. """ primary_count = sum(a.is_primary_reference_map() for a in reference_maps) if primary_count > 1 or primary_count < 1: From 9b0f65edd72200140d205120090a3788017079f3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:47:21 -0800 Subject: [PATCH 113/877] edit description of is_null function --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5e6fe88..b2743a4 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -18,7 +18,7 @@ def is_null(value): """ - Returns True if a stripped/lowercase value in in `nan_col_values`. + This function checks if the value exists or is null. 
Parameters __________ From 023a8941448cdc71768305b9d66f01426ae1a0cd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:47:40 -0800 Subject: [PATCH 114/877] add description to parameter --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index b2743a4..a33916c 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -23,6 +23,7 @@ def is_null(value): Parameters __________ value : + The value to be checked. Returns _______ From 0a82d86e504408e16080ffbba69a0d22618ddeb0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:47:55 -0800 Subject: [PATCH 115/877] add return type and description --- mavecore/validation/genome_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index a33916c..c524f92 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -27,7 +27,8 @@ def is_null(value): Returns _______ - + bool + True if a stripped/lowercase value in in `nan_col_values`. 
""" value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value From 76a390fc6410f6be250c4c8c7cc52f61032d33f5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:48:05 -0800 Subject: [PATCH 116/877] reformat --- mavecore/validation/genome_validators.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index c524f92..abd257b 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -410,9 +410,7 @@ def validate_map_has_unique_reference_genome(annotations): """ genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) if len(genomes) < len(annotations): - raise ValidationError( - "Each reference map must specify a different reference genome." - ) + raise ValidationError("Each reference map must specify a different reference genome.") def validate_map_has_at_least_one_interval(reference_map): From e66481617cfd00cd405d85fc904023124d7a33c1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:48:23 -0800 Subject: [PATCH 117/877] add description --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index abd257b..5f198c8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -415,6 +415,8 @@ def validate_map_has_unique_reference_genome(annotations): def validate_map_has_at_least_one_interval(reference_map): """ + This function validates that a reference map has at least one interval and raises an error + if this is not the case. 
:param reference_map: :return: From 97b68bf8e019b17dbde495594dd4240abacedae8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:48:46 -0800 Subject: [PATCH 118/877] reformat paramters and errors and add descriptions --- mavecore/validation/genome_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5f198c8..bbed2cf 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -418,8 +418,15 @@ def validate_map_has_at_least_one_interval(reference_map): This function validates that a reference map has at least one interval and raises an error if this is not the case. - :param reference_map: - :return: + Parameters + __________ + reference_map : + Reference map. + + Raises + ______ + ValidationError + If the reference_map does not have at least one interval. """ if not reference_map.get_intervals().count(): raise ValidationError( From ea0af282d6d95f0d2873baa1e6293046d375e13b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:48:57 -0800 Subject: [PATCH 119/877] reformat --- mavecore/validation/genome_validators.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index bbed2cf..1480371 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -429,9 +429,7 @@ def validate_map_has_at_least_one_interval(reference_map): If the reference_map does not have at least one interval. """ if not reference_map.get_intervals().count(): - raise ValidationError( - "You must specify at least one interval for each reference map." 
- ) + raise ValidationError("You must specify at least one interval for each reference map.") def validate_at_least_one_map(reference_maps): From 0e5075978597a67b2a716dc3451cf86520081f7d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:49:13 -0800 Subject: [PATCH 120/877] add description and parameters --- mavecore/validation/genome_validators.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 1480371..76d32bd 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -434,6 +434,12 @@ def validate_map_has_at_least_one_interval(reference_map): def validate_at_least_one_map(reference_maps): """ + This function validates whether a target has at least one reference map specified + and raises an error if it does not. + + Parameters + __________ + reference_maps : :param reference_maps: :return: From df91f350dc786ec0c11718568a45cd9b8a2531a7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:49:51 -0800 Subject: [PATCH 121/877] add errors in docstring and include description --- mavecore/validation/genome_validators.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 76d32bd..e27df45 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -441,8 +441,11 @@ def validate_at_least_one_map(reference_maps): __________ reference_maps : - :param reference_maps: - :return: + + Raises + ______ + ValidationError + If the target does not have at least one reference map specified. 
""" if not len(reference_maps): raise ValidationError( From 79af8cd2ea1835ceb8fe857a818ad6c3682a5e78 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 10:49:59 -0800 Subject: [PATCH 122/877] reformat --- mavecore/validation/genome_validators.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index e27df45..9bba56c 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -448,9 +448,7 @@ def validate_at_least_one_map(reference_maps): If the target does not have at least one reference map specified. """ if not len(reference_maps): - raise ValidationError( - "A target must have at least one reference map specified." - ) + raise ValidationError("A target must have at least one reference map specified.") def validate_one_primary_map(reference_maps): From 58a955bacc4f69e07449becc940db8db11742d67 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:53:24 -0800 Subject: [PATCH 123/877] remove unused import --- mavecore/validation/genome_validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 9bba56c..f9fa9eb 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -9,7 +9,6 @@ Most validation should validate one specific field, unless fields need to be validated against each other. 
""" -import re from fqfa.validator.validator import dna_bases_validator, amino_acids_validator from mavecore.validation.exceptions import ValidationError From ef48689e41b435f7dac127903ec1a15b018e3e18 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:53:39 -0800 Subject: [PATCH 124/877] correct typo --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index f9fa9eb..8792ff5 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -27,7 +27,7 @@ def is_null(value): Returns _______ bool - True if a stripped/lowercase value in in `nan_col_values`. + True if a stripped/lowercase value in `nan_col_values`. """ value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value From 1d750565ab8a38cbcc56e4e3ec0faa2220731d99 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:54:09 -0800 Subject: [PATCH 125/877] add description to validate_interval_start_lteq_end docstring --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 8792ff5..42b1758 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -222,6 +222,8 @@ def is_attached(self): # ------------------------------------------------------------------------- # def validate_interval_start_lteq_end(start, end): """ + This function validates whether or not an interval's starting coordinate is less than + or equal to that interval's ending coordinate. 
Parameters __________ From 501e7902f3c40c45f9045fe49ca7cf3cb5913d04 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:54:49 -0800 Subject: [PATCH 126/877] add tpes and descriptions to parameters for validate_interval_start_lteq_end function docstring --- mavecore/validation/genome_validators.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 42b1758..52315bc 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -227,8 +227,10 @@ def validate_interval_start_lteq_end(start, end): Parameters __________ - start : - end : + start : int + The interval's starting coordinate. + end : int + The interval's ending coordinate. Returns _______ From e0ec6b6b104ca9265ba9d41cfb06e3fde56efa69 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 14:55:05 -0800 Subject: [PATCH 127/877] add types and descriptions to errors for validate_interval_start_lteq_end function docstring --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 52315bc..1e91f8e 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -234,6 +234,8 @@ def validate_interval_start_lteq_end(start, end): Returns _______ + None + If start is NoneType or end is NoneType. 
Raises ______ From 7284b6aa6c3b007049b769e1b812d37f295b9681 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:25:36 -0800 Subject: [PATCH 128/877] add TODO, find the type of value --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 1e91f8e..84379f6 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -255,6 +255,8 @@ def validate_interval_start_lteq_end(start, end): def validate_strand(value): + # TODO + # find the type of value """ Parameters From 302ebf4ef32695199034c1306be59242cd487279 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:25:53 -0800 Subject: [PATCH 129/877] add description to docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 84379f6..6f14d2e 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -258,6 +258,7 @@ def validate_strand(value): # TODO # find the type of value """ + This function validates a GenomicInterval strand and raises an error if the strand is invalid. 
Parameters __________ From 9c54f9c157ed074a16537fefa7fcc9748a69c0df Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:26:07 -0800 Subject: [PATCH 130/877] add description to value parameter of docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 6f14d2e..b4bd603 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -263,6 +263,7 @@ def validate_strand(value): Parameters __________ value : + The Genomic Interval strand to be validated. Raises ______ From 813ad0d668c77dc41107c21b8cc5b4d6d29cb424 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:26:35 -0800 Subject: [PATCH 131/877] add TODO, add description and type for value parameter --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index b4bd603..d0e7641 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -275,6 +275,8 @@ def validate_strand(value): def validate_chromosome(value): + # TODO + # add description and type for value parameter """ :param value: From c66c4d43c159011f8dbf2b136833619c251205bf Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:27:12 -0800 Subject: [PATCH 132/877] restructure docstring, add parameters, returns, and raises sections --- mavecore/validation/genome_validators.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index d0e7641..0e50c21 100644 --- a/mavecore/validation/genome_validators.py +++ 
b/mavecore/validation/genome_validators.py @@ -279,8 +279,19 @@ def validate_chromosome(value): # add description and type for value parameter """ - :param value: - :return: + Parameters + __________ + value : + + Returns + _______ + None + If value is NoneType. + + Raises + ______ + ValidationError + If chromosome identifier is null. """ # Intervals may be underspecified, but will be ignored so skip validation. if value is None: From 78543f13349bb43f1b556448fb4abd5897fa64a6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:27:40 -0800 Subject: [PATCH 133/877] add TODO, add description and interval parameter type plus description --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 0e50c21..a053b65 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -301,6 +301,8 @@ def validate_chromosome(value): def validate_unique_intervals(intervals): + # TODO + # add description and interval parameter type plus description """ :param intervals: From edf60c62a7b864340aebd4db6c1917c42c2965c3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:28:02 -0800 Subject: [PATCH 134/877] restructure docstring, add parameter and raises sections --- mavecore/validation/genome_validators.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index a053b65..f0c8616 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -305,8 +305,14 @@ def validate_unique_intervals(intervals): # add description and interval parameter type plus description """ - :param intervals: - :return: + Parameters + __________ + intervals : + + Raises + ______ + 
ValidationError + If the same interval was specified twice. """ for interval1 in intervals: for interval2 in intervals: From 042d3262faaef75853479b5691afd63602832abb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:28:27 -0800 Subject: [PATCH 135/877] add TODO, add description and seq type plus description --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index f0c8616..bc2807e 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -331,6 +331,8 @@ def validate_unique_intervals(intervals): # WildTypeSequence # ------------------------------------------------------------------------- # def validate_wildtype_sequence(seq, as_type="any"): + # TODO + # add description and seq type plus description """ :param seq: From 06d99d3a2873eab59f6b3b8d0696d00c06125c64 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:29:01 -0800 Subject: [PATCH 136/877] restructure docstring, add parameters and raises sections --- mavecore/validation/genome_validators.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index bc2807e..d5d9964 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -335,9 +335,18 @@ def validate_wildtype_sequence(seq, as_type="any"): # add description and seq type plus description """ - :param seq: - :param as_type: - :return: + Parameters + __________ + seq : + as_type : str + (default = "any") + + Raises + ______ + ValidationError + If seq is not a valid wild type sequence. + ValidationError + If seq is not a valid DNA or protein reference sequence. 
""" # from .models import WildTypeSequence From b16778685e1b4261f85ce215a8687c4d6e694a46 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:29:13 -0800 Subject: [PATCH 137/877] add description to docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index d5d9964..28341cb 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -381,6 +381,7 @@ def validate_wildtype_sequence(seq, as_type="any"): def sequence_is_dna(seq): """ + This function checks if seq is a DNA sequence. :param seq: :return: From d6741072535296183380c34479b3ba17cb7c64f4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:29:37 -0800 Subject: [PATCH 138/877] restructure docstring, add parameters and returns sections --- mavecore/validation/genome_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 28341cb..8b923f1 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -383,8 +383,15 @@ def sequence_is_dna(seq): """ This function checks if seq is a DNA sequence. - :param seq: - :return: + Parameters + __________ + seq : str + The sequence to be validated. + + Returns + _______ + bool + True if the dna_bases_validator returns a match object. """ # Explicitly check for these cases as they are also valid AA sequences. 
if is_null(seq): From 2f55b6179c3d2f039085ed24f393f4f002100272 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:38:15 -0800 Subject: [PATCH 139/877] revise TODO note --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 8b923f1..116d5cb 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -332,7 +332,7 @@ def validate_unique_intervals(intervals): # ------------------------------------------------------------------------- # def validate_wildtype_sequence(seq, as_type="any"): # TODO - # add description and seq type plus description + # add description to as_type parameter """ Parameters From 99fb5a8a240a1763f7590f3962c82710a321de4e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:38:25 -0800 Subject: [PATCH 140/877] add description to docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 116d5cb..e5b5174 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -334,6 +334,7 @@ def validate_wildtype_sequence(seq, as_type="any"): # TODO # add description to as_type parameter """ + This function checks whether or not seq is a wildtype sequence. 
Parameters __________ From 4a23549f73e998fc1e6339962df3d2e6253f48ef Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:38:50 -0800 Subject: [PATCH 141/877] add type and description to seq parameter in docstring --- mavecore/validation/genome_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index e5b5174..476e618 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -338,7 +338,8 @@ def validate_wildtype_sequence(seq, as_type="any"): Parameters __________ - seq : + seq : str + The sequence being validated. as_type : str (default = "any") From e3e1db969ffa24442a1dd2d22a8d14145741c706 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:39:02 -0800 Subject: [PATCH 142/877] add description to docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 476e618..a3d80cd 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -404,6 +404,7 @@ def sequence_is_dna(seq): def sequence_is_protein(seq): """ + This function check if seq is a protein sequence. 
:param seq: :return: From 2fd44b082d52b59d7f204a74c7dcb352d8cd1131 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 15:39:27 -0800 Subject: [PATCH 143/877] reformat docstring, add parameters and returns sections --- mavecore/validation/genome_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index a3d80cd..16741e8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -406,8 +406,15 @@ def sequence_is_protein(seq): """ This function check if seq is a protein sequence. - :param seq: - :return: + Parameters + __________ + seq : str + The sequence being validated. + + Returns + _______ + bool + True if seq is not null, is a DNA sequence or amino_acids_validator returns a match object. """ # Explicitly check for these cases as they are also valid AA sequences. if is_null(seq): From 8d1d4ca1a5fab0abaa2f899737db108eb833ba46 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:01:25 -0800 Subject: [PATCH 144/877] add TODO, confirm organism_name type --- mavecore/validation/genome_validators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 16741e8..de96503 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -427,7 +427,9 @@ def sequence_is_protein(seq): # ReferenceGenome # ------------------------------------------------------------------------- # -def validate_organism_name(value): +def validate_organism_name(organism_name): + # TODO + # confirm organism_name type """ :param value: From 2364c7529d8e21f98d637711ec51e0fa57efe323 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 
16:01:38 -0800 Subject: [PATCH 145/877] add description to docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index de96503..5710b32 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -431,6 +431,7 @@ def validate_organism_name(organism_name): # TODO # confirm organism_name type """ + This function validates the organism name by checking that the name is not null. :param value: :return: From 1df4b96b5ff51595312c9bd7ccbff39951533c6f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:01:58 -0800 Subject: [PATCH 146/877] reformat docstring, add parameters and raises sections --- mavecore/validation/genome_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 5710b32..917fdba 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -433,8 +433,15 @@ def validate_organism_name(organism_name): """ This function validates the organism name by checking that the name is not null. - :param value: - :return: + Parameters + __________ + organism_name : str + The organism name to be validated. + + Raises + ______ + ValidationError + If the organism name is null. 
""" if is_null(value): raise ValidationError("Species name must not be null.") From 54177c8c1114919e6cfc0cc2a2a66d734fcf808d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:02:23 -0800 Subject: [PATCH 147/877] give parameter more meaningful name --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 917fdba..42a00d9 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -443,7 +443,7 @@ def validate_organism_name(organism_name): ValidationError If the organism name is null. """ - if is_null(value): + if is_null(organism_name): raise ValidationError("Species name must not be null.") From b1ab46007deb3f57bba872e4812c91d69e3bedab Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:02:43 -0800 Subject: [PATCH 148/877] add TODO, function may have bug --- mavecore/validation/genome_validators.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 42a00d9..18d3334 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -448,6 +448,10 @@ def validate_organism_name(organism_name): def validate_reference_genome_has_one_external_identifier(referencegenome): + # TODO + # revise description, make sure it is accurate + # anything greater than 0 will return True, so should it be == 1 or > 0? 
+ # determine what type referencegenome is """ :param referencegenome: From bbc947dfaf098d2a6971be1ac112eae7dc09308c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:03:04 -0800 Subject: [PATCH 149/877] add partial function description to docstring --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 18d3334..890cbb4 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -453,6 +453,8 @@ def validate_reference_genome_has_one_external_identifier(referencegenome): # anything greater than 0 will return True, so should it be == 1 or > 0? # determine what type referencegenome is """ + This function validates whether or not the reference genome has one external identifier. + An error is raised if :param referencegenome: :return: From 01ddc07619220d10527ad361ec1a58f62f66dd53 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:03:27 -0800 Subject: [PATCH 150/877] reformat docstring, add parameters and raises section --- mavecore/validation/genome_validators.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 890cbb4..52b3213 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -456,8 +456,14 @@ def validate_reference_genome_has_one_external_identifier(referencegenome): This function validates whether or not the reference genome has one external identifier. 
An error is raised if - :param referencegenome: - :return: + Parameters + __________ + referencegenome : + + Raises + ______ + ValidationError + If """ if not referencegenome.genome_id: raise ValidationError( From f30940279392ded8d60361abc9eb320c1e0c94f0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:03:55 -0800 Subject: [PATCH 151/877] add TODO, confirm the type of the value parameter --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 52b3213..e48413b 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -472,6 +472,8 @@ def validate_reference_genome_has_one_external_identifier(referencegenome): def validate_genome_short_name(value): + # TODO + # confirm the type of the value parameter """ :param value: From 9c7808e9e355a7b4381bafb1574c8d8364898616 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:04:22 -0800 Subject: [PATCH 152/877] add description to docstring --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index e48413b..72b3e54 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -475,6 +475,7 @@ def validate_genome_short_name(value): # TODO # confirm the type of the value parameter """ + This function validates the genome short name and raises an error if the value is null. 
:param value: :return: From 13cd844d37e0269fd4f7d55fa87e25eb9775d32b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:04:46 -0800 Subject: [PATCH 153/877] reformat docstring, add parameters and raises section --- mavecore/validation/genome_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 72b3e54..c3a4c86 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -477,8 +477,15 @@ def validate_genome_short_name(value): """ This function validates the genome short name and raises an error if the value is null. - :param value: - :return: + Parameters + __________ + value : str + The genome short name to be validated. + + Raises + ______ + ValidationError + If the genome short name is null. """ if is_null(value): raise ValidationError("Genome short name must not be null.") From db8854a22df14687fc99e5407cc5e96f2124c23f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:05:19 -0800 Subject: [PATCH 154/877] add TODO, check the type of annotations parameter and add description to said parameter --- mavecore/validation/genome_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index c3a4c86..a6146c8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -494,6 +494,9 @@ def validate_genome_short_name(value): # ReferenceMap # ------------------------------------------------------------------------- # def validate_map_has_unique_reference_genome(annotations): + # TODO + # check the type of annotations + # add description to annotations parameter """ :param annotations: From 6a07f2099eb912e279b555525cca1db2cfcb54bb Mon Sep 17 00:00:00 2001 
From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:05:37 -0800 Subject: [PATCH 155/877] add description to docstring --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index a6146c8..55bd375 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -498,6 +498,8 @@ def validate_map_has_unique_reference_genome(annotations): # check the type of annotations # add description to annotations parameter """ + This function validates whether or not each map in annotations has a + unique reference genome and raises an error if this is not the case. :param annotations: :return: From 1fca1e3baa4462b6c8a8eeccc690bb382d8cd855 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:05:59 -0800 Subject: [PATCH 156/877] add parameters and raises section to docstring --- mavecore/validation/genome_validators.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 55bd375..046bd80 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -501,8 +501,14 @@ def validate_map_has_unique_reference_genome(annotations): This function validates whether or not each map in annotations has a unique reference genome and raises an error if this is not the case. - :param annotations: - :return: + Parameters + __________ + annotations : + + Raises + ______ + ValidationError + If each reference map does not specify a different reference genome. 
""" genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) if len(genomes) < len(annotations): From acec6deb25cb906305f047f6ebfd9cddb099fe15 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:06:36 -0800 Subject: [PATCH 157/877] give parameter more meaningful name, add TODO, confirm gene_name parameter type --- mavecore/validation/genome_validators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 046bd80..dc5bf7f 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -574,7 +574,9 @@ def validate_one_primary_map(reference_maps): # TargetGene # ------------------------------------------------------------------------- # -def validate_gene_name(value): +def validate_gene_name(gene_name): + # TODO + # confirm gene_name type """ This function checks to see if a gene name is null and raises and error if it is. From aa75ebe5488c2ba730f0fa850f621648a915370c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:06:58 -0800 Subject: [PATCH 158/877] give temp type to gene_name parameter in docstring --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index dc5bf7f..51e69d0 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -582,7 +582,7 @@ def validate_gene_name(gene_name): Parameters __________ - value : + gene_name : str The gene name. 
Raises From f01ad5af6ea50a6459aaae893a1fdf40ae445bb3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:07:18 -0800 Subject: [PATCH 159/877] give parameter more meaningful name --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 51e69d0..e4cfafe 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -590,5 +590,5 @@ def validate_gene_name(gene_name): ValidationError If gene name (value parameter) is null. """ - if is_null(value): + if is_null(gene_name): raise ValidationError("Gene name must not be null.") From 4aae580afda17d5ca9270c3ac52bc852d5f7b636 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:42:56 -0800 Subject: [PATCH 160/877] add TODO, confirm sequence parameter type --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index e4cfafe..f571485 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -65,6 +65,8 @@ class SequenceType: @classmethod def detect_sequence_type(cls, sequence): + # TODO + # confirm sequence parameter type """ Parameters From e5a275279e70eeca3e35ea649c67e7650d2795b0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:43:38 -0800 Subject: [PATCH 161/877] add description to docstring --- mavecore/validation/genome_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index f571485..0256ada 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -68,6 +68,9 
@@ def detect_sequence_type(cls, sequence): # TODO # confirm sequence parameter type """ + This function determines if the sequence is a DNA or protein sequence and + returns "dna" if it is DNA or "protein" if it is protein. An error is raised + if it is neither. Parameters __________ From 71b4e7638cc26a94cbd6bac641c0a0625c8e593c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:43:51 -0800 Subject: [PATCH 162/877] add type to sequence parameter in docstring --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 0256ada..91dd7eb 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -74,7 +74,7 @@ def detect_sequence_type(cls, sequence): Parameters __________ - sequence : + sequence : str Returns _______ From f48ac6d61c1d80f36ccb7cc19827ea48d988b155 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:44:09 -0800 Subject: [PATCH 163/877] add return type and description to docstring --- mavecore/validation/genome_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index 91dd7eb..ad9adb0 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -78,6 +78,8 @@ def detect_sequence_type(cls, sequence): Returns _______ + str + "dna" or "protein" depending on if the sequence is a DNA or protein sequence. 
Raises ______ From a1756f35bebdb5e0e30a911cdae391513d0f00e4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 16:44:20 -0800 Subject: [PATCH 164/877] add period --- mavecore/validation/genome_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index ad9adb0..ba4845d 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -84,7 +84,7 @@ def detect_sequence_type(cls, sequence): Raises ______ ValueError - If sequence parameter is not protein or DNA + If sequence parameter is not protein or DNA. """ if sequence_is_dna(sequence): return cls.DNA From 932a60d9f773f6348209635711ae2d8c1f483c44 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 17:05:42 -0800 Subject: [PATCH 165/877] reformat docstring, add parameters and returns sections, add TODO, check that parameter type is correct --- mavecore/validation/metadata_validators.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 92bff1b..ce7e1b8 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -6,7 +6,21 @@ def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" + # TODO + # check that parameter type is accurate + """ + This function checks that the passed value is null. + + Parameters + __________ + value : str + Value to be checked if null. + + Returns + _______ + bool + True if a stripped/lowercase value in in `nan_col_values`. 
+ """ value = str(value).strip().lower() return null_values_re.fullmatch(value) or not value From 5bcac4d95ee637770f4a1e7b81dd8992e8fee1ac Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 17:05:50 -0800 Subject: [PATCH 166/877] remove unused import --- mavecore/validation/metadata_validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index ce7e1b8..d5bbc97 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -1,4 +1,3 @@ -import re import idutils from mavecore.validation.exceptions import ValidationError From 926f1849141f678e2e9b7d4c38cc1bd1165b9bcd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 17:06:02 -0800 Subject: [PATCH 167/877] add complete docstring --- mavecore/validation/metadata_validators.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index d5bbc97..6725091 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -39,6 +39,21 @@ def validate_sra_identifier(identifier): def validate_keyword(kw): + """ + This function validates whether or not the kw parameter is valid by + checking that it is a string that is not null. If kw is null + or is not a string, an error is raised. + + Parameters + __________ + kw : str + The keyword to be validated. + + Raises + ______ + ValidationError + If the kw argument is not a valid string. + """ if is_null(kw) or not isinstance(kw, str): raise ValidationError( f"'{kw}' not a valid keyword. Keywords must be valid strings." 
From 9b42325e47c5bdf28b0504bd1b071ae7ef503431 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 3 Mar 2022 17:06:07 -0800 Subject: [PATCH 168/877] add complete docstring --- mavecore/validation/metadata_validators.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 6725091..8bc6568 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -93,6 +93,16 @@ def validate_genome_identifier(identifier): def validate_keyword_list(values): + """ + This function takes a list of keyword values and validates that each one is valid. + A valid keyword is a non-null string. The validate_keyword function will raise an + ValidationError if any of the keywords are invalid. + + Parameters + __________ + values : list[str] + The list of values to be validated. + """ for value in values: if not is_null(value): validate_keyword(value) From 17492174f5c51a55fd37f95fd92f003fe97766f2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:05:43 -0800 Subject: [PATCH 169/877] add parameter type and description --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index c65f33d..ff09517 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -13,7 +13,8 @@ def is_null(value): Parameters __________ - value + value : str + The value to be checked as null or not. 
Returns _______ From ccdc4cfa4bbe0927616bdfba9aa68d058f68216e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:05:51 -0800 Subject: [PATCH 170/877] add return type and description --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index ff09517..2f56e47 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -18,7 +18,8 @@ def is_null(value): Returns _______ - + bool + True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. """ value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value From 535c2a244068238dd800ee9ee02bb7f9161698bc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:06:02 -0800 Subject: [PATCH 171/877] begin docstring --- mavecore/validation/dataset_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 2f56e47..cbd8ea6 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -27,6 +27,7 @@ def is_null(value): class WordLimitValidator: """ + This class """ message = "This field is limited to {} words." 
From c3a3f0ecb0d52f08cedc001f22ff85127ffadea3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:06:27 -0800 Subject: [PATCH 172/877] add TODO, verify code parameter type --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index cbd8ea6..5e21d4e 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -35,6 +35,8 @@ class WordLimitValidator: counter = re.compile(r"\w+\b", flags=re.IGNORECASE) def __init__(self, word_limit, message=None, code=None): + # TODO + # check the code parameter type """ Parameters From fe3de8d3e04588442c4cb949ebf16cf5fe583dd4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:06:38 -0800 Subject: [PATCH 173/877] add description to docstring --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 5e21d4e..c9241bb 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -38,6 +38,8 @@ def __init__(self, word_limit, message=None, code=None): # TODO # check the code parameter type """ + This constructor sets the values of the WordLimitValidator class attributes + message, code, and counter. 
Parameters __________ From 0dcf1b4aa4c47d01260d4dc3a76a6ba33193faec Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:07:02 -0800 Subject: [PATCH 174/877] add parameter types and descriptions to docstring --- mavecore/validation/dataset_validators.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index c9241bb..c3a289f 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -43,9 +43,12 @@ def __init__(self, word_limit, message=None, code=None): Parameters __________ - word_limit : - message : + word_limit : int + The word limit assigned to the word limit attribute. + message : str + (default = None) The message assigned to the message attribute. code : + (default = None) The code assigned to the code attribute. """ if message is not None: self.message = message From fbd1a5c35a68f214d73f9ed0e96c51d270e28d80 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:07:22 -0800 Subject: [PATCH 175/877] add TODO, confirm parameter types --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index c3a289f..8b3d816 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -77,6 +77,8 @@ def __call__(self, value): def read_header_from_io(file, label=None, msg=None): + # TODO + # confirm types for parameters """ Parameters __________ From 03e03ecbddd2a2111a500105e4de120cf22c29a8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:07:32 -0800 Subject: [PATCH 176/877] add description to docstring --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 8b3d816..c664ecf 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -80,6 +80,8 @@ def read_header_from_io(file, label=None, msg=None): # TODO # confirm types for parameters """ + This takes a file and reads the header from that file. + Parameters __________ file : From 94f60a2fe7bdca0a4853f765e411aa998aeff5b0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:07:58 -0800 Subject: [PATCH 177/877] add types and descriptions to parameters in docstring --- mavecore/validation/dataset_validators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index c664ecf..f46c2f3 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -85,10 +85,10 @@ def read_header_from_io(file, label=None, msg=None): Parameters __________ file : - label : - default = None - msg : - default = None + label : str + (default = None) + msg : str + (default = None) The message that is printed in the event of an error is raised. Returns _______ From 232a5c056c10b21a29812d69d69314811b111164 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:08:07 -0800 Subject: [PATCH 178/877] add types and descriptions to returns in docstring --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index f46c2f3..18bf5dc 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -92,6 +92,8 @@ def read_header_from_io(file, label=None, msg=None): Returns _______ + str + The header that was read from io. 
Raises ______ From 49f8764115c29ef29b79f356b47287db69208fae Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:08:21 -0800 Subject: [PATCH 179/877] add error description to docstring --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 18bf5dc..7c67d8b 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -98,7 +98,8 @@ def read_header_from_io(file, label=None, msg=None): Raises ______ ValueError - If + If a header could not be parsed from file. Columns must be coma delimited. Column names + with commas must be escaped by enclosing them in double quotes. """ if label is None: label = "uploaded" From 2d5fe07935f35dfda3f7612e572647e99ff6b01b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:08:43 -0800 Subject: [PATCH 180/877] add TODO, verify parameter type --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7c67d8b..4baaf2e 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -155,6 +155,8 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): def validate_at_least_one_additional_column(header, label=None, msg=None): + # TODO + # verify parameter types """ Parameters __________ From e09196aee4122407d3f2c9e8215f2e5f4011b73d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:08:57 -0800 Subject: [PATCH 181/877] add description to docstring --- mavecore/validation/dataset_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/dataset_validators.py 
b/mavecore/validation/dataset_validators.py index 4baaf2e..da8e4e4 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -158,6 +158,9 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): # TODO # verify parameter types """ + This function checks the passed header to see if there exists additional columns besides the three + specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. + Parameters __________ header : From 5e9ba8d9352fe51827b2728d617ce798b271bf9b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:09:10 -0800 Subject: [PATCH 182/877] add description to errors in docstring --- mavecore/validation/dataset_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index da8e4e4..ba71459 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -172,6 +172,7 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): Raises ______ ValueError + If there are not additional columns in the header argument. 
""" if label is None: label = "Uploaded" From be8693263979cc170537db5d035c3f99000ee5ec Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:09:26 -0800 Subject: [PATCH 183/877] add description to docstring --- mavecore/validation/dataset_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index ba71459..5496027 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -194,6 +194,9 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): def validate_header_contains_no_null_columns(header, label=None, msg=None): """ + This function checks that the header parameter does not contain any null columns that + are not in the case-insensitive null values listed in constants.readable_null_values. + Parameters __________ header : From 0fe98d7833b0709d59dc3d5ce9df8c5abed308fd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:09:45 -0800 Subject: [PATCH 184/877] add default values to parameters in docstring --- mavecore/validation/dataset_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 5496027..ea45269 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -201,6 +201,7 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): __________ header : label : + (default = None) msg : Raises From 8e3913d272c8d871aa21a7ae9f217034e74f967c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:09:48 -0800 Subject: [PATCH 185/877] add default values to parameters in docstring --- mavecore/validation/dataset_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index ea45269..81c4aa2 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -203,6 +203,7 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): label : (default = None) msg : + (default = None) Raises ______ From 36141a2775e1a52617bddc7727a9807413ac679f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:10:00 -0800 Subject: [PATCH 186/877] add error description to docstring --- mavecore/validation/dataset_validators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 81c4aa2..03a3d6c 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -208,6 +208,9 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): Raises ______ ValueError + If the file header contains blank/empty/whitespace. Only columns or the + case-insensitive null values listed in constants.readable_null_values + are permitted. 
""" if label is None: label = "File" From b9dfa42cc2bee36d2ed4ea9011e3811ae4809eee Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:10:39 -0800 Subject: [PATCH 187/877] start docstring, un-formatted --- mavecore/validation/metadata_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 8bc6568..927a2da 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -61,6 +61,11 @@ def validate_keyword(kw): def validate_pubmed_identifier(identifier): + """ + + :param identifier: + :return: + """ if not idutils.is_pmid(identifier): raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") From 94bcbfd3401e03546ea948ff89839aa5c8c16618 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:10:43 -0800 Subject: [PATCH 188/877] start docstring, un-formatted --- mavecore/validation/metadata_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 927a2da..a207bdc 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -71,6 +71,11 @@ def validate_pubmed_identifier(identifier): def validate_doi_identifier(identifier): + """ + + :param identifier: + :return: + """ if not idutils.is_doi(identifier): raise ValidationError(f"'{identifier}' is not a valid DOI.") From 290f621dea598c834cbef0b1168e30d0aed87716 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:10:46 -0800 Subject: [PATCH 189/877] start docstring, un-formatted --- mavecore/validation/metadata_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/metadata_validators.py 
b/mavecore/validation/metadata_validators.py index a207bdc..db7776c 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -81,6 +81,11 @@ def validate_doi_identifier(identifier): def validate_ensembl_identifier(identifier): + """ + + :param identifier: + :return: + """ if not idutils.is_ensembl(identifier): raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") From 411d384472eca6ec9b60c0da1a1f84a79fb16b38 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:10:49 -0800 Subject: [PATCH 190/877] start docstring, un-formatted --- mavecore/validation/metadata_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index db7776c..68e975b 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -91,6 +91,11 @@ def validate_ensembl_identifier(identifier): def validate_uniprot_identifier(identifier): + """ + + :param identifier: + :return: + """ if not idutils.is_uniprot(identifier): raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") From 84b65662dfe1793075034d926cd3193fff90d4c5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:11:07 -0800 Subject: [PATCH 191/877] start docstring, un-formatted --- mavecore/validation/metadata_validators.py | 39 ++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 68e975b..3c9d5d1 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -101,11 +101,21 @@ def validate_uniprot_identifier(identifier): def validate_refseq_identifier(identifier): + """ + + :param identifier: + :return: + """ if not idutils.is_refseq(identifier): 
raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") def validate_genome_identifier(identifier): + """ + + :param identifier: + :return: + """ if not idutils.is_genome(identifier): raise ValidationError( f"'{identifier}' is not a valid GenBank or RefSeq genome assembly." @@ -129,36 +139,65 @@ def validate_keyword_list(values): def validate_pubmed_list(values): + """ + :param values: + :return: + """ for value in values: if not is_null(value): validate_pubmed_identifier(value) def validate_sra_list(values): + """ + + :param values: + :return: + """ for value in values: if not is_null(value): validate_sra_identifier(value) def validate_doi_list(values): + """ + + :param values: + :return: + """ for value in values: if not is_null(value): validate_doi_identifier(value) def validate_ensembl_list(values): + """ + + :param values: + :return: + """ for value in values: if not is_null(value): validate_ensembl_identifier(value) def validate_refseq_list(values): + """ + + :param values: + :return: + """ for value in values: if not is_null(value): validate_refseq_identifier(value) def validate_uniprot_list(values): + """ + + :param values: + :return: + """ for value in values: if not is_null(value): validate_uniprot_identifier(value) From 309e29de75c458160d446d5578a6f1c910677197 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:11:23 -0800 Subject: [PATCH 192/877] start docstrings, un-formatted --- mavecore/validation/urn_validators.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index f44f760..18b40c9 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -56,6 +56,11 @@ def validate_mavedb_urn(urn): + """ + + :param urn: + :return: + """ if not MAVEDB_ANY_URN_RE.match(urn): raise ValidationError( "Error test" @@ -64,6 +69,11 @@ def 
validate_mavedb_urn(urn): def validate_mavedb_urn_experimentset(urn): + """ + + :param urn: + :return: + """ if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( "Error test" @@ -72,6 +82,11 @@ def validate_mavedb_urn_experimentset(urn): def validate_mavedb_urn_experiment(urn): + """ + + :param urn: + :return: + """ if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( "Error test" @@ -80,6 +95,11 @@ def validate_mavedb_urn_experiment(urn): def validate_mavedb_urn_scoreset(urn): + """ + + :param urn: + :return: + """ if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( "Error test" @@ -88,6 +108,11 @@ def validate_mavedb_urn_scoreset(urn): def validate_mavedb_urn_variant(urn): + """ + + :param urn: + :return: + """ if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( "Error test" From 53e2a5a15f824cd83971c5b82fc7eccc4b397666 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:11:49 -0800 Subject: [PATCH 193/877] add parameters section to docstring --- mavecore/validation/validate.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index 155bf99..aa4d578 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -3,7 +3,14 @@ def validate_all(countfile=None, scorefile=None, scorejson=None): """ - By calling other helper functions, this function runs all of the validation code + By calling other helper functions, this function runs all of the validation code. 
+ + Parameters + __________ + countfile : + scorefile : + scorejson : + """ validate_dataset(countfile, scorefile, scorejson) From ff2e0505ac2d3b160464b348b508dcfa5eeeca14 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:11:54 -0800 Subject: [PATCH 194/877] add parameters section to docstring --- mavecore/validation/validate.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index aa4d578..b138c9a 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -20,6 +20,12 @@ def validate_dataset(countfile=None, scorefile=None, scorejson=None): This function calls all of the validation functions within mavetools/mavetools/validation/dataset_validation.py + Parameters + __________ + countfile : + scorefile : + scorejson : + Returns ------- From 9a86f7d058482079490fc26997328bdd046f531b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:28:47 -0800 Subject: [PATCH 195/877] add TODO, document errors correctly, note key error --- mavecore/validation/variant_validators/variant.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py index 68e3849..bf00e71 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -9,6 +9,8 @@ def validate_columns_match(variant, scoreset) -> None: + # TODO + # document errors correctly, note key error """ Validate that a child matches parents defined columns to keep data in sync. 
From 67efb487a30be2c649d4f16245a910e724118540 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:15:40 -0800 Subject: [PATCH 196/877] new name --- docs/source/dataset_validators.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/dataset_validators.rst b/docs/source/dataset_validators.rst index 11155cd..7ad9623 100644 --- a/docs/source/dataset_validators.rst +++ b/docs/source/dataset_validators.rst @@ -1,5 +1,5 @@ -validation -========== +dataset validation +================== validators features user-side mave dataset validators functions that replicate some of the server-side validation done in MaveDB. From b795602e6ed09abc36ae4554c5445250eb42313f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:15:57 -0800 Subject: [PATCH 197/877] need to write new description --- docs/source/dataset_validators.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/dataset_validators.rst b/docs/source/dataset_validators.rst index 7ad9623..9c3609a 100644 --- a/docs/source/dataset_validators.rst +++ b/docs/source/dataset_validators.rst @@ -1,8 +1,8 @@ dataset validation ================== -validators features user-side mave dataset validators functions -that replicate some of the server-side validation done in MaveDB. +dataset validation description + .. 
automodule:: mavecore.validation.dataset_validators :members: From 1f0901df9ab0c7c993c063e281bacf1d8d3969f0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:16:07 -0800 Subject: [PATCH 198/877] remove one automodule --- docs/source/dataset_validators.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/dataset_validators.rst b/docs/source/dataset_validators.rst index 9c3609a..dd44d0e 100644 --- a/docs/source/dataset_validators.rst +++ b/docs/source/dataset_validators.rst @@ -6,5 +6,3 @@ dataset validation description .. automodule:: mavecore.validation.dataset_validators :members: -.. automodule:: mavecore.validation.genome_validators - :members: From 87da30df1cbcf6556d382ff5115569c01fcd5da1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:16:32 -0800 Subject: [PATCH 199/877] add genome validation documentation --- docs/source/genome_validators.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/source/genome_validators.rst diff --git a/docs/source/genome_validators.rst b/docs/source/genome_validators.rst new file mode 100644 index 0000000..2b2618d --- /dev/null +++ b/docs/source/genome_validators.rst @@ -0,0 +1,7 @@ +genome validation +================= + +description here + +.. automodule:: mavecore.validation.genome_validators + :members: From 1e46922bb85335c9628a39110bfc982d8f7a79e9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:16:45 -0800 Subject: [PATCH 200/877] edit text --- docs/source/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index c3eded2..64b7bb9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,8 +4,8 @@ contain the root `toctree` directive. MaveCore -========= -MaveTools is a pure Python Module for bioinformatics and computational biology. 
+======== +MaveCore is a pure Python Module for bioinformatics and computational biology. It features all the shared functionality of MaveDB and MaveTools. Install MaveCore using pip:: From ba40cff70d3a2e59767fbd2b50f4f2694632f2f2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:19:04 -0800 Subject: [PATCH 201/877] add metadata validation description --- docs/source/metadata_validators.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/source/metadata_validators.rst diff --git a/docs/source/metadata_validators.rst b/docs/source/metadata_validators.rst new file mode 100644 index 0000000..7d377fa --- /dev/null +++ b/docs/source/metadata_validators.rst @@ -0,0 +1,7 @@ +metadata validation +=================== + +description here + +.. automodule:: mavecore.validation.metadata_validators + :members: From 8970e86d77932aa21c35125a95f41af45164c5ce Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:19:33 -0800 Subject: [PATCH 202/877] add urn validation documentation --- docs/source/urn_validators.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/source/urn_validators.rst diff --git a/docs/source/urn_validators.rst b/docs/source/urn_validators.rst new file mode 100644 index 0000000..12bf6a2 --- /dev/null +++ b/docs/source/urn_validators.rst @@ -0,0 +1,7 @@ +urn validation +============== + +urn validation description + +.. 
automodule:: mavecore.validation.urn_validators + :members: From 9d8aa42f738ef384ff30845380de4ad948800384 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:19:57 -0800 Subject: [PATCH 203/877] edit title --- docs/source/validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 864dde5..904d63b 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -1,4 +1,4 @@ -validation +Validation ========== validators features user-side mave dataset validators functions From 449f182afb30b02b0db34663bceb26e127f56072 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:20:09 -0800 Subject: [PATCH 204/877] add description --- docs/source/validation.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 904d63b..dfb1907 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -1,5 +1,6 @@ Validation ========== +Validation features mave dataset validators functions applied in MaveTools and MaveDB. validators features user-side mave dataset validators functions that replicate some of the server-side validation done in MaveDB. From 0b69597ec8e5dbf0ba904830479f11ab76f51b8f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:20:24 -0800 Subject: [PATCH 205/877] add toctree --- docs/source/validation.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/validation.rst b/docs/source/validation.rst index dfb1907..8fe7a8d 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -2,8 +2,9 @@ Validation ========== Validation features mave dataset validators functions applied in MaveTools and MaveDB. 
-validators features user-side mave dataset validators functions -that replicate some of the server-side validation done in MaveDB. +.. toctree:: + :maxdepth: 2 + :caption: Contents: .. automodule:: mavecore.validation.dataset_validators :members: From 2b39d48d71f617cb403735cfe7e2f5efbbb3ef35 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:20:34 -0800 Subject: [PATCH 206/877] add validators --- docs/source/validation.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 8fe7a8d..21d9303 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -6,5 +6,8 @@ Validation features mave dataset validators functions applied in MaveTools and M :maxdepth: 2 :caption: Contents: -.. automodule:: mavecore.validation.dataset_validators - :members: + dataset_validators + genome_validators + metadata_validators + urn_validators + variant_validators From d735d127de3404bd41c0c04a5c175ac6c8e61eef Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 13:20:50 -0800 Subject: [PATCH 207/877] add variant validators documentation --- docs/source/variant_validators.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/source/variant_validators.rst diff --git a/docs/source/variant_validators.rst b/docs/source/variant_validators.rst new file mode 100644 index 0000000..1ea012f --- /dev/null +++ b/docs/source/variant_validators.rst @@ -0,0 +1,7 @@ +variant validation +================== + +description here + +.. 
automodule:: mavecore.validation.variant_validators + :members: From 2d60d9784144cbea9159eaa10314e01ecbcffd6a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 15:45:59 -0800 Subject: [PATCH 208/877] update dataset validation description --- docs/source/dataset_validators.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/dataset_validators.rst b/docs/source/dataset_validators.rst index dd44d0e..7b39877 100644 --- a/docs/source/dataset_validators.rst +++ b/docs/source/dataset_validators.rst @@ -1,7 +1,7 @@ dataset validation ================== -dataset validation description +Dataset validation contains validation code for datasets. .. automodule:: mavecore.validation.dataset_validators From 289c84e4e9b25b74c74b927922ee9548e6605294 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 15:46:11 -0800 Subject: [PATCH 209/877] update genome validation description --- docs/source/genome_validators.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/genome_validators.rst b/docs/source/genome_validators.rst index 2b2618d..e2745ca 100644 --- a/docs/source/genome_validators.rst +++ b/docs/source/genome_validators.rst @@ -1,7 +1,8 @@ genome validation ================= -description here +Genome validation contains validation functions relating to wild type sequences, +reference genomes, target genes, reference maps, and genomic intervals. .. 
automodule:: mavecore.validation.genome_validators :members: From 40cacb7707057e90478460796d6f287165906021 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 15:46:20 -0800 Subject: [PATCH 210/877] update metadata validation description --- docs/source/metadata_validators.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/metadata_validators.rst b/docs/source/metadata_validators.rst index 7d377fa..11422c2 100644 --- a/docs/source/metadata_validators.rst +++ b/docs/source/metadata_validators.rst @@ -1,7 +1,8 @@ metadata validation =================== -description here +Metadata validation contains functions designed to check the validity of metadata relating to +sra, keyword, pubmed, doi, ensembl, uniprot, refseq and genome identifiers and lists. .. automodule:: mavecore.validation.metadata_validators :members: From 316a90f657a99715375f24b8fd2709c655bbfbc3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 15:46:32 -0800 Subject: [PATCH 211/877] update urn validation description --- docs/source/urn_validators.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/urn_validators.rst b/docs/source/urn_validators.rst index 12bf6a2..26d5bb9 100644 --- a/docs/source/urn_validators.rst +++ b/docs/source/urn_validators.rst @@ -1,7 +1,7 @@ urn validation ============== -urn validation description +Urn validation validates MaveDB urn values. .. 
automodule:: mavecore.validation.urn_validators :members: From 69fdf88d98ac542cf62f8d98497adf190abcb8b0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 15:46:48 -0800 Subject: [PATCH 212/877] update variant validation description --- docs/source/variant_validators.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/variant_validators.rst b/docs/source/variant_validators.rst index 1ea012f..433b064 100644 --- a/docs/source/variant_validators.rst +++ b/docs/source/variant_validators.rst @@ -1,7 +1,7 @@ variant validation ================== -description here +Variant validation contains functions to validate variants within a dataset. .. automodule:: mavecore.validation.variant_validators :members: From b146c4546945f92924cf8f5a58fe33e5ca5df471 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Mar 2022 15:46:58 -0800 Subject: [PATCH 213/877] edit text --- docs/source/validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/validation.rst b/docs/source/validation.rst index 21d9303..0b0545b 100644 --- a/docs/source/validation.rst +++ b/docs/source/validation.rst @@ -1,6 +1,6 @@ Validation ========== -Validation features mave dataset validators functions applied in MaveTools and MaveDB. +Validation features MAVE dataset validator functions applied in MaveTools and MaveDB. .. 
toctree:: :maxdepth: 2 From 43b4153680302ee0be7637f25b8098118c0aee5c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:40:01 -0800 Subject: [PATCH 214/877] add description to docstring --- mavecore/validation/urn_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 18b40c9..365b334 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -57,6 +57,7 @@ def validate_mavedb_urn(urn): """ + This function validates a MaveDB urn and raises an error if it is not valid. :param urn: :return: From dc4080c9ebb6c9564e9cfd06ffb155d4d6e8c11f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:40:18 -0800 Subject: [PATCH 215/877] add parameters and raises sections to docstring --- mavecore/validation/urn_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 365b334..552e54b 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -59,8 +59,15 @@ def validate_mavedb_urn(urn): """ This function validates a MaveDB urn and raises an error if it is not valid. - :param urn: - :return: + Parameters + __________ + urn : str + The MaveDB urn to be validated. + + Raises + ______ + ValidationError + If the MaveDB urn is not valid. 
""" if not MAVEDB_ANY_URN_RE.match(urn): raise ValidationError( From e074ab19f86c660a8759fba47290d694aee3cf4d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:40:30 -0800 Subject: [PATCH 216/877] edit error message --- mavecore/validation/urn_validators.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 552e54b..7f4be20 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -71,8 +71,7 @@ def validate_mavedb_urn(urn): """ if not MAVEDB_ANY_URN_RE.match(urn): raise ValidationError( - "Error test" - # "%(urn)s is not a valid urn.", params={"urn": urn} + "%(urn)s is not a valid urn.", params={"urn": urn} ) From 307075be85f75c1db64da17697377b5ed5e90e16 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:40:42 -0800 Subject: [PATCH 217/877] add description --- mavecore/validation/urn_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 7f4be20..ad202c4 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -77,6 +77,7 @@ def validate_mavedb_urn(urn): def validate_mavedb_urn_experimentset(urn): """ + This function validates a Experiment Set urn and raises an error if it is not valid. 
:param urn: :return: From 39a52279d61ca26a9c44225a542cd5f3d6b3510f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:41:13 -0800 Subject: [PATCH 218/877] add parameters and raises sections to docstring, including descriptions and variable types --- mavecore/validation/urn_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index ad202c4..146696d 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -79,8 +79,15 @@ def validate_mavedb_urn_experimentset(urn): """ This function validates a Experiment Set urn and raises an error if it is not valid. - :param urn: - :return: + Parameters + __________ + urn : str + The Experiment Set urn to be validated. + + Raises + ______ + ValidationError + If the Experiment Set urn is not valid. """ if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( From 50e82988d7660c37166a5529f9374382a6dc3fa1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:41:24 -0800 Subject: [PATCH 219/877] edit error message --- mavecore/validation/urn_validators.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 146696d..073da86 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -91,8 +91,7 @@ def validate_mavedb_urn_experimentset(urn): """ if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( - "Error test" - # "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} + "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} ) From b3cf633c3b33e4269b652b93666aa62b598a9b62 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:41:39 -0800 Subject: [PATCH 220/877] add description to docstring --- mavecore/validation/urn_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 073da86..f82ee90 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -97,6 +97,7 @@ def validate_mavedb_urn_experimentset(urn): def validate_mavedb_urn_experiment(urn): """ + This function validates an Experiment urn and raises an error if it is not valid. :param urn: :return: From 14b6e35434887f2bbecd6cb9d9dde063ee43603d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:42:16 -0800 Subject: [PATCH 221/877] add parameters and raises section to docstring, including descriptions and parameter types --- mavecore/validation/urn_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index f82ee90..b0c0ba6 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -99,8 +99,15 @@ def validate_mavedb_urn_experiment(urn): """ This function validates an Experiment urn and raises an error if it is not valid. - :param urn: - :return: + Parameters + __________ + urn : str + The Experiment urn to be validated. + + Raises + ______ + ValidationError + If the Experiemnt urn is not valid. 
""" if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( From 22b4e24ece2d9ce961ad9e003db96b200f8569f2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:42:27 -0800 Subject: [PATCH 222/877] edit error message --- mavecore/validation/urn_validators.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index b0c0ba6..9d98a88 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -111,8 +111,7 @@ def validate_mavedb_urn_experiment(urn): """ if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( - "Error test" - # "%(urn)s is not a valid Experiment urn.", params={"urn": urn} + "%(urn)s is not a valid Experiment urn.", params={"urn": urn} ) From f9c2dcd524f312372cba77023fe68c89620ca978 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:42:39 -0800 Subject: [PATCH 223/877] add docstring description --- mavecore/validation/urn_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 9d98a88..0476aeb 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -117,6 +117,7 @@ def validate_mavedb_urn_experiment(urn): def validate_mavedb_urn_scoreset(urn): """ + This function validates a Scoreset urn and raises an error if it is not valid. 
:param urn: :return: From a749e42c2339cf1ffcddce5e55a1ed0e8422e383 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:43:12 -0800 Subject: [PATCH 224/877] add parameters and raises sections to docstring, include descriptions and parameter types --- mavecore/validation/urn_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 0476aeb..73d3943 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -119,8 +119,15 @@ def validate_mavedb_urn_scoreset(urn): """ This function validates a Scoreset urn and raises an error if it is not valid. - :param urn: - :return: + Parameters + __________ + urn : str + The Scoreset urn to be validated + + Raises + ______ + ValidationError + If the Scoreset urn is not valid. """ if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( From aa343f558a2db33bc3a08e49f49542537ae2f21b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:43:21 -0800 Subject: [PATCH 225/877] edit error message --- mavecore/validation/urn_validators.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 73d3943..a920524 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -131,8 +131,7 @@ def validate_mavedb_urn_scoreset(urn): """ if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( - "Error test" - # "%(urn)s is not a valid score set urn.", params={"urn": urn} + "%(urn)s is not a valid score set urn.", params={"urn": urn} ) From 60858ca3096bc898e6a8805ddba601a1748fa934 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 
9 Mar 2022 14:43:32 -0800 Subject: [PATCH 226/877] add description to docstring --- mavecore/validation/urn_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index a920524..4ddb234 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -137,6 +137,7 @@ def validate_mavedb_urn_scoreset(urn): def validate_mavedb_urn_variant(urn): """ + This function validates a MaveDB Variant urn and raises an error if it is not valid. :param urn: :return: From 9983ad8c24e8cb2fe567f22d18a3e1ad57f5ec8b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:43:46 -0800 Subject: [PATCH 227/877] add parameters and raises sections to docstring --- mavecore/validation/urn_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 4ddb234..e66f6af 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -139,8 +139,15 @@ def validate_mavedb_urn_variant(urn): """ This function validates a MaveDB Variant urn and raises an error if it is not valid. - :param urn: - :return: + Parameters + __________ + urn : str + The MaveDB Variant urn to be validated. + + Raises + ______ + ValidationError + If the MaveDB Variant urn is not valid. 
""" if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( From fa38c1ede10d521e79b98b27fae226cf540c090c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Mar 2022 14:43:57 -0800 Subject: [PATCH 228/877] edit error message --- mavecore/validation/urn_validators.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index e66f6af..22ebf15 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -151,6 +151,5 @@ def validate_mavedb_urn_variant(urn): """ if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): raise ValidationError( - "Error test" - # "%(urn)s is not a valid Variant urn.", params={"urn": urn} + "%(urn)s is not a valid Variant urn.", params={"urn": urn} ) From 73fb252481aebfdd4d5e897cc23eb6c7ee4459c1 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 15 Mar 2022 15:55:30 +1100 Subject: [PATCH 229/877] add a setup.py and .pre-commit-config.yaml, and changed a little in __init__.py file. 
--- .pre-commit-config.yaml | 19 +++++++++++++++++++ mavecore/__init__.py | 8 ++++++++ setup.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..aa9503f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-yaml + - id: check-json + - id: debug-statements + - id: detect-private-key + - id: end-of-file-fixer + files: \.py$ + - id: pretty-format-json + - id: trailing-whitespace + files: \.py$ +- repo: https://github.com/psf/black + rev: 21.5b1 + hooks: + - id: black + language_version: python3 + args: [--line-length=79] diff --git a/mavecore/__init__.py b/mavecore/__init__.py index e69de29..fa78694 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -0,0 +1,8 @@ +from validation.variant_validators import ( + validate_nt_variant, + validate_hgvs_string, + validate_splice_variant, + validate_pro_variant, + validate_variant_json, + validate_columns_match, +) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b223753 --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +import setuptools +import sys + +with open("README.md", "r") as fh: + long_description = fh.read() + +requirements = ["fqfa>=1.2.1"] +# fqfa requires backported dataclasses in Python 3.6 +if sys.version_info.major == 3 and sys.version_info.minor == 6: + requirements.append("dataclasses") + +setuptools.setup( + name="mavecore", + version="0.1.0", + author="Daniel Esposito and Alan F Rubin", + author_email="alan.rubin@wehi.edu.au", + description=( + "MaveCore is to create a new dependency that contains all the shared functionality for MaveTools and MaveDB." 
+ ), + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/VariantEffect/MaveCore/tree/main", + packages=setuptools.find_packages(), + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + python_requires=">=3.6", + install_requires=requirements, + test_suite="tests", +) From 719b360f23e5a2b1ce6acc8f7e816336f7e2873d Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 15 Mar 2022 15:57:24 +1100 Subject: [PATCH 230/877] Change url to branch testMaveCore in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b223753..d4aeb26 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ ), long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/VariantEffect/MaveCore/tree/main", + url="https://github.com/VariantEffect/MaveCore/tree/testMaveCore", packages=setuptools.find_packages(), classifiers=[ "Development Status :: 3 - Alpha", From 70e63e64c270aa9b14287877259a8302f82f0422 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 15 Mar 2022 16:45:32 +1100 Subject: [PATCH 231/877] Add function to __all__ in __init__.py --- mavecore/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mavecore/__init__.py b/mavecore/__init__.py index fa78694..4424268 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -6,3 +6,12 @@ validate_variant_json, validate_columns_match, ) + +__all__ = [ + "validate_columns_match", + "validate_pro_variant", + "validate_variant_json", + "validate_splice_variant", + "validate_nt_variant", + "validate_hgvs_string", +] From 615e31c6334f2f71d7911c1192365d53a63eaa69 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Wed, 16 Mar 2022 10:37:50 
+1100 Subject: [PATCH 232/877] changed something --- mavecore/__init__.py | 6 ------ mavecore/validation/variant_validators/hgvs.py | 5 ++++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/mavecore/__init__.py b/mavecore/__init__.py index 4424268..dd8691e 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -1,17 +1,11 @@ from validation.variant_validators import ( - validate_nt_variant, validate_hgvs_string, - validate_splice_variant, - validate_pro_variant, validate_variant_json, validate_columns_match, ) __all__ = [ "validate_columns_match", - "validate_pro_variant", "validate_variant_json", - "validate_splice_variant", - "validate_nt_variant", "validate_hgvs_string", ] diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 6e157e9..51adc1b 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -13,6 +13,7 @@ hgvs_pro_column, ) +__all__ = ["validate_hgvs_string"] # from core.utilities import is_null def is_null(value): @@ -83,7 +84,9 @@ def validate_hgvs_string( f"protein variant prefix is 'p.'." ) else: - raise ValueError("Unknown column '{}'. Expected nt, splice or p".format(column)) + raise ValueError( + "Unknown column '{}'. 
Expected nt, splice or p".format(column) + ) return str(variant) From c148bc2dee691632d41652f981fcb0da18c65ffd Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Wed, 16 Mar 2022 11:07:36 +1100 Subject: [PATCH 233/877] changed something --- mavecore/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/__init__.py b/mavecore/__init__.py index dd8691e..54c1bf6 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -1,4 +1,4 @@ -from validation.variant_validators import ( +from mavecore.validation.variant_validators import ( validate_hgvs_string, validate_variant_json, validate_columns_match, From 02001852d6bce6d299499c42e1ed9b4710b392ae Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 15 Mar 2022 17:43:13 -0700 Subject: [PATCH 234/877] add .travis.yml file --- .travis.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..86c1a8c --- /dev/null +++ b/.travis.yml @@ -0,0 +1,28 @@ +language: python +matrix: + include: + - python: "3.6" + - python: "3.7" + - python: "3.8" + - python: "3.9" + - python: "3.9-dev" + - python: "3.10-dev" + - python: "pypy3" + env: NO_MYPY=true + allow_failures: + - python: "3.9" + - python: "3.9-dev" + - python: "3.10-dev" + - python: "pypy3" + env: NO_MYPY=true +install: + - pip3 install . +before_script: + - pip3 install coverage + - pip3 install coveralls + - if ! $NO_MYPY; then pip3 install mypy; fi +script: + - coverage run --source MaveCore -m unittest + - if ! 
$NO_MYPY; then mypy MaveCore tests; fi +after_success: + - coveralls \ No newline at end of file From 385e1ad9911872722b7c21a71fc016176ef2e41e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 15 Mar 2022 18:54:01 -0700 Subject: [PATCH 235/877] add to init file, for module import --- mavecore/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/__init__.py b/mavecore/__init__.py index e69de29..ecbf958 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -0,0 +1,11 @@ +from mavecore.validation.variant_validators import ( + validate_hgvs_string, + validate_variant_json, + validate_columns_match, +) + +__all__ = [ + "validate_columns_match", + "validate_variant_json", + "validate_hgvs_string", +] \ No newline at end of file From b9049442f7abe0edf0a98d94cca9b634878a737c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 15 Mar 2022 18:54:25 -0700 Subject: [PATCH 236/877] add setup.py file --- setup.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ac68b6b --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +import setuptools +import sys + +with open("README.md", "r") as fh: + long_description = fh.read() + +requirements = ["fqfa>=1.2.1"] +# fqfa requires backported dataclasses in Python 3.6 +if sys.version_info.major == 3 and sys.version_info.minor == 6: + requirements.append("dataclasses") + +setuptools.setup( + name="mavecore", + version="0.1.0", + author="Daniel Esposito and Alan F Rubin", + author_email="alan.rubin@wehi.edu.au", + description=( + "MaveCore is to create a new dependency that contains all the shared functionality for MaveTools and MaveDB." 
+ ), + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/VariantEffect/MaveCore/tree/add_validation", + packages=setuptools.find_packages(), + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + python_requires=">=3.6", + install_requires=requirements, + test_suite="tests", +) \ No newline at end of file From b8a9d7b8cdc2c713188ea30b2be190f5e910f9a0 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Wed, 16 Mar 2022 17:38:53 +1100 Subject: [PATCH 237/877] changed something --- .travis.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..1ba065f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,28 @@ +language: python +matrix: + include: + - python: "3.6" + - python: "3.7" + - python: "3.8" + - python: "3.9" + - python: "3.9-dev" + - python: "3.10-dev" + - python: "pypy3" + env: NO_MYPY=true + allow_failures: + - python: "3.9" + - python: "3.9-dev" + - python: "3.10-dev" + - python: "pypy3" + env: NO_MYPY=true +install: + - pip3 install . +before_script: + - pip3 install coverage + - pip3 install coveralls + - if ! $NO_MYPY; then pip3 install mypy; fi +script: + - coverage run --source mavecore -m unittest + - if ! 
$NO_MYPY; then mypy mavecore tests; fi +after_success: + - coveralls \ No newline at end of file From ce4d4bab846aec8b38cd67b513f82d96708796a7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 21 Mar 2022 18:36:43 -0700 Subject: [PATCH 238/877] duplicate directory and contents for testing purposes --- mavecore/original_validation/__init__.py | 0 mavecore/original_validation/constants.py | 90 ++ .../original_validation/dataset_validators.py | 374 ++++++ mavecore/original_validation/exceptions.py | 2 + .../original_validation/genome_validators.py | 601 ++++++++++ .../metadata_validators.py | 203 ++++ .../original_validation/urn_validators.py | 155 +++ mavecore/original_validation/validate.py | 69 ++ .../variant_validators/__init__.py | 25 + .../variant_validators/dataset.py | 1019 +++++++++++++++++ .../variant_validators/hgvs.py | 134 +++ .../variant_validators/variant.py | 85 ++ 12 files changed, 2757 insertions(+) create mode 100644 mavecore/original_validation/__init__.py create mode 100644 mavecore/original_validation/constants.py create mode 100644 mavecore/original_validation/dataset_validators.py create mode 100644 mavecore/original_validation/exceptions.py create mode 100644 mavecore/original_validation/genome_validators.py create mode 100644 mavecore/original_validation/metadata_validators.py create mode 100644 mavecore/original_validation/urn_validators.py create mode 100644 mavecore/original_validation/validate.py create mode 100644 mavecore/original_validation/variant_validators/__init__.py create mode 100644 mavecore/original_validation/variant_validators/dataset.py create mode 100644 mavecore/original_validation/variant_validators/hgvs.py create mode 100644 mavecore/original_validation/variant_validators/variant.py diff --git a/mavecore/original_validation/__init__.py b/mavecore/original_validation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/original_validation/constants.py 
b/mavecore/original_validation/constants.py new file mode 100644 index 0000000..f10aab8 --- /dev/null +++ b/mavecore/original_validation/constants.py @@ -0,0 +1,90 @@ +import re + +""" +Null Constant definitions +""" +NA_value = "NA" +null_values_list = ( + "nan", + "na", + "none", + "", + "undefined", + "n/a", + "null", + "nil", + NA_value, +) + +null_values_re = re.compile( + r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_value), flags=re.IGNORECASE +) + +readable_null_values = [ + "'{}'".format(v) for v in set([v.lower() for v in null_values_list]) if v.strip() +] + ["whitespace"] + +""" +Sequence constants +""" +AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" +DNA_LETTERS = "ATCG" + +DNA_SEQ_PATTERN = fr"[{DNA_LETTERS}]+" +AA_SEQ_PATTERN = fr"[{AA_LETTERS}]+" + + +""" +Constant definitions for application `experiment`. +""" +from mavecore.validation.urn_validators import ( + MAVEDB_EXPERIMENTSET_URN_PATTERN, + MAVEDB_EXPERIMENT_URN_PATTERN, + MAVEDB_SCORESET_URN_PATTERN, + MAVEDB_TMP_URN_PATTERN, +) + +hgvs_nt_column = "hgvs_nt" +hgvs_splice_column = "hgvs_splice" +hgvs_pro_column = "hgvs_pro" +hgvs_columns = sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) +meta_data = "meta_data" +score_columns = "score_columns" +count_columns = "count_columns" +variant_score_data = "score_data" +variant_count_data = "count_data" +required_score_column = "score" + +experimentset_url_pattern = "|".join( + [MAVEDB_EXPERIMENTSET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] +) +experiment_url_pattern = "|".join( + [MAVEDB_EXPERIMENT_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] +) +scoreset_url_pattern = "|".join( + [MAVEDB_SCORESET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] +) + +any_url_pattern = "|".join( + [experimentset_url_pattern, experiment_url_pattern, scoreset_url_pattern] +) + + +valid_dataset_columns = [score_columns, count_columns] +valid_variant_columns = [variant_score_data, variant_count_data] + +variant_to_scoreset_column = { + 
variant_score_data: score_columns, + variant_count_data: count_columns, +} +scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} + +# Celery dataset status +processing = "processing" +failed = "failed" +success = "success" + +# User roles +administrator = "administrator" +editor = "editor" +viewer = "viewer" diff --git a/mavecore/original_validation/dataset_validators.py b/mavecore/original_validation/dataset_validators.py new file mode 100644 index 0000000..7a0b669 --- /dev/null +++ b/mavecore/original_validation/dataset_validators.py @@ -0,0 +1,374 @@ +import io +import csv +import re + +from numpy.testing import assert_array_equal + +from mavecore.validation import constants + + +def is_null(value): + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. + + Parameters + __________ + value : str + The value to be checked as null or not. + + Returns + _______ + bool + True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. + """ + value = str(value).strip().lower() + return constants.null_values_re.fullmatch(value) or not value + + +class WordLimitValidator: + """ + This class + + Attributes + __________ + message : str + Message template to describe how many words a field is limited to. + code : str + + counter : str + + """ + message = "This field is limited to {} words." + code = "invalid" + counter = re.compile(r"\w+\b", flags=re.IGNORECASE) + + def __init__(self, word_limit, message=None, code=None): + # TODO + # check the code parameter type + """ + This constructor sets the values of the WordLimitValidator class attributes + message, code, and counter. + + Parameters + __________ + word_limit : int + The word limit assigned to the word limit attribute. + message : str + (default = None) The message assigned to the message attribute. + code : + (default = None) The code assigned to the code attribute. 
+ """ + if message is not None: + self.message = message + if code is not None: + self.code = code + self.word_limit = int(word_limit) + + def __call__(self, value): + """ + Parameters + __________ + value : + + Returns + _______ + + Raises + ______ + ValueError + If + """ + if not value: + return + if len(self.counter.findall(value)) > self.word_limit: + raise ValueError(self.message.format(self.word_limit)) + + +def read_header_from_io(file, label=None, msg=None): + # TODO + # confirm types for parameters + """ + This takes a file and reads the header from that file. + + Parameters + __________ + file : + label : str + (default = None) + msg : str + (default = None) The message that is printed in the event of an error is raised. + + Returns + _______ + str + The header that was read from io. + + Raises + ______ + ValueError + If a header could not be parsed from file. Columns must be coma delimited. Column names + with commas must be escaped by enclosing them in double quotes. + """ + if label is None: + label = "uploaded" + + try: + header_line = file.readline() + if isinstance(header_line, bytes): + header_line = header_line.decode() + file.seek(0) + f = io.StringIO(header_line.strip()) + return [h.strip() for h in csv.DictReader(f, delimiter=",").fieldnames] + except Exception: + if not msg: + msg = ( + "A header could not be parsed from your {} file. Make sure" + "Columns are comma delimited. Column names with commas must be" + "escaped by enclosing them in double quotes.".format(label) + ) + raise ValueError(msg) + + +def validate_has_hgvs_in_header(header, label=None, msg=None): + """ + Parameters + __________ + header : + label : + default = None + msg : + default = None + + Raises + ______ + ValueError + If + """ + if label is None: + label = "Uploaded" + params = {} + if msg is None: + msg = ( + "Your %(label)s file must define either a nucleotide hgvs column " + "'%(col_nt)s' or a protein hgvs column '%(col_p)s'. 
" + "Columns are case-sensitive and must be comma delimited." + ) + params = { + "label": label, + "col_nt": constants.hgvs_nt_column, + "col_p": constants.hgvs_pro_column, + } + if not set(header) & set(constants.hgvs_columns): + raise ValueError(msg) + + +def validate_at_least_one_additional_column(header, label=None, msg=None): + # TODO + # verify parameter types + """ + This function checks the passed header to see if there exists additional columns besides the three + specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. + + Parameters + __________ + header : + label : + default = None + msg : + default = None + + Raises + ______ + ValueError + If there are not additional columns in the header argument. + """ + if label is None: + label = "Uploaded" + params = {} + if not any(v not in constants.hgvs_columns for v in header): + if msg is None: + msg = ( + "Your %(label)s file must define at " + "least one additional column different " + "from '{}', '{}' and '{}'.".format( + constants.hgvs_nt_column, + constants.hgvs_splice_column, + constants.hgvs_pro_column, + ) + ) + params = {"label": label} + raise ValueError(msg) + + +def validate_header_contains_no_null_columns(header, label=None, msg=None): + """ + This function checks that the header parameter does not contain any null columns that + are not in the case-insensitive null values listed in constants.readable_null_values. + + Parameters + __________ + header : + label : + (default = None) + msg : + (default = None) + + Raises + ______ + ValueError + If the file header contains blank/empty/whitespace. Only columns or the + case-insensitive null values listed in constants.readable_null_values + are permitted. 
+ """ + if label is None: + label = "File" + any_null = any([is_null(v) for v in header]) + if any_null: + if msg is None: + msg = ( + "%(label)s file header cannot contain blank/empty/whitespace " + "only columns or the following case-insensitive null " + "values: {}.".format(label, ", ".join(constants.readable_null_values)) + ) + raise ValueError(msg) + + +def validate_datasets_define_same_variants(scores, counts): + """ + Checks if two `pd.DataFrame` objects parsed from uploaded files + define the same variants. + + Parameters + ---------- + scores : `pd.DataFrame` + Scores dataframe parsed from an uploaded scores file. + counts : `pd.DataFrame` + Scores dataframe parsed from an uploaded counts file. + + Raises + ______ + ValueError + If score and counts files do not define the same variants. + """ + try: + assert_array_equal( + scores[constants.hgvs_nt_column].sort_values().values, + counts[constants.hgvs_nt_column].sort_values().values, + ) + assert_array_equal( + scores[constants.hgvs_splice_column].sort_values().values, + counts[constants.hgvs_splice_column].sort_values().values, + ) + assert_array_equal( + scores[constants.hgvs_pro_column].sort_values().values, + counts[constants.hgvs_pro_column].sort_values().values, + ) + except AssertionError: + raise ValueError( + "Your score and counts files do not define the same variants. " + "Check that the hgvs columns in both files match." + ) + + +def validate_scoreset_score_data_input(file): + """ + Validator function for checking that the scores file input contains + at least the column 'hgvs' and 'score'. Returns the file to position 0 + after reading the header (first line). + + Parameters + ---------- + file : :class:`io.FileIO` + An open file handle in read mode. 
+ + Raises + ______ + ValueError + If score data file is missing the required column constants.required_score_column + """ + file.seek(0) + header = read_header_from_io(file, label="Score") + validate_header_contains_no_null_columns(header, label="Score") + validate_has_hgvs_in_header(header, label="Score") + validate_at_least_one_additional_column(header, label="Score") + + if constants.required_score_column not in header: + raise ValueError( + "Score data file is missing the required column " + + constants.required_score_column + + "." + + "Columns are case-sensitive and must be comma delimited." + ) + + +def validate_scoreset_count_data_input(file): + """ + Validator function for checking that the counts file input contains + at least the column 'hgvs'. Returns the file to position 0 + after reading the header (first line). + + Parameters + ---------- + file : :class:`io.FileIO` + File parsed by a `django` form. + """ + file.seek(0) + header = read_header_from_io(file, label="Count") + validate_header_contains_no_null_columns(header, label="Count") + validate_has_hgvs_in_header(header, label="Count") + validate_at_least_one_additional_column(header, label="Count") + + +def validate_scoreset_json(dict_): + """ + Checks a given dictionary to ensure that it is suitable to be used + as the `dataset_columns` attribute in a :class:`ScoreSet` instance. + + Parameters + ---------- + dict_ : dict + Dictionary of keys mapping to a list. + + Raises + ______ + ValueError + If scoreset data is missing the required key. + ValueError + If header values are not strings. + ValueError + If + ValueError + If missing required column constants.required_score_column for score dataset. + ValueError + If encountered unexpected keys extras. 
+ """ + required_columns = [constants.score_columns, constants.count_columns] + + for key in required_columns: + if key not in dict_.keys(): + raise ValueError("Scoreset data is missing the required key " + key) + + columns = dict_[key] + if not all([isinstance(c, str) for c in columns]): + raise ValueError("Header values must be strings.") + + if not isinstance(columns, list): + type_ = type(columns).__name__ + raise ValueError( + "Value for " + key.replace("_", " ") + " must be a list not " + type_ + ) + + # Check score columns is not-empty and at least contains hgvs and score + if key == constants.score_columns: + if constants.required_score_column not in columns: + raise ValueError( + "Missing required column constants.required_score_column " + "for score dataset." + ) + + # Check there are not unexpected columns supplied to the scoreset json + # field. + extras = [k for k in dict_.keys() if k not in set(required_columns)] + if len(extras) > 0: + extras = [k for k in dict_.keys() if k not in required_columns] + raise ValueError("Encountered unexpected keys extras") diff --git a/mavecore/original_validation/exceptions.py b/mavecore/original_validation/exceptions.py new file mode 100644 index 0000000..2851fa7 --- /dev/null +++ b/mavecore/original_validation/exceptions.py @@ -0,0 +1,2 @@ +class ValidationError(ValueError): + pass diff --git a/mavecore/original_validation/genome_validators.py b/mavecore/original_validation/genome_validators.py new file mode 100644 index 0000000..ba4845d --- /dev/null +++ b/mavecore/original_validation/genome_validators.py @@ -0,0 +1,601 @@ +""" +Validator functions for the fields of the following classes: + WildTypeSequence + ReferenceGenome + TargetGene + ReferenceMap + GenomicInterval + +Most validation should validate one specific field, unless fields need +to be validated against each other. 
+""" +from fqfa.validator.validator import dna_bases_validator, amino_acids_validator +from mavecore.validation.exceptions import ValidationError + +from mavecore.validation import constants + + +def is_null(value): + """ + This function checks if the value exists or is null. + + Parameters + __________ + value : + The value to be checked. + + Returns + _______ + bool + True if a stripped/lowercase value in `nan_col_values`. + """ + value = str(value).strip().lower() + return constants.null_values_re.fullmatch(value) or not value + + +# min_start_validator = MinValueValidator( +# 1, message=_("Start coordinate must be a positive integer.") +# ) +# min_end_validator = MinValueValidator( +# 1, message=_("End coordinate must be a positive integer.") +# ) + + +class WildTypeSequence: + """ + Basic model specifying a wild-type sequence. + + Parameters + ---------- + sequence : `models.CharField` + The wild type DNA sequence that is related to the `target`. Will + be converted to upper-case upon instantiation. + + sequence_type : `models.CharField` + Protein sequence (amino acids) or DNA (nucleotides) + """ + + class SequenceType: + """ + + """ + DNA = "dna" + PROTEIN = "protein" + INFER = "infer" + + @classmethod + def detect_sequence_type(cls, sequence): + # TODO + # confirm sequence parameter type + """ + This function determines if the sequence is a DNA or protein sequence and + returns "dna" if it is DNA or "protein" if it is protein. An error is raised + if it is neither. + + Parameters + __________ + sequence : str + + Returns + _______ + str + "dna" or "protein" depending on if the sequence is a DNA or protein sequence. + + Raises + ______ + ValueError + If sequence parameter is not protein or DNA. + """ + if sequence_is_dna(sequence): + return cls.DNA + elif sequence_is_protein(sequence): + return cls.PROTEIN + else: + raise ValueError( + f"Unknown sequence '{sequence}'. It is not protein or DNA." 
+ ) + + @classmethod + def is_protein(cls, value): + """ + + Parameters + __________ + value : + + Returns + _______ + + """ + return value == cls.PROTEIN + + @classmethod + def is_dna(cls, value): + """ + + Parameters + __________ + value : + + Returns + _______ + + """ + return value == cls.DNA + + @classmethod + def choices(cls): + """ + + Returns + _______ + """ + return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] + + class Meta: + """ + + """ + verbose_name = "Reference sequence" + verbose_name_plural = "Reference sequences" + + def __str__(self): + """ + + Returns + _______ + + """ + return self.get_sequence() + + # sequence = models.TextField( + # default=None, + # blank=False, + # null=False, + # verbose_name="Reference sequence", + # validation=[validate_wildtype_sequence], + # ) + # sequence_type = models.CharField( + # blank=True, + # null=False, + # default=SequenceType.INFER, + # verbose_name="Reference sequence type", + # max_length=32, + # choices=SequenceType.choices(), + # ) + + @property + def is_dna(self): + """ + + Returns + _______ + + """ + return self.__class__.SequenceType.is_dna(self.sequence_type) + + @property + def is_protein(self): + """ + + Returns + _______ + + """ + return self.__class__.SequenceType.is_protein(self.sequence_type) + + def save(self, *args, **kwargs): + """ + + Parameters + __________ + args : + kwargs : + + Returns + _______ + + """ + if self.sequence is not None: + self.sequence = self.sequence.upper() + self.sequence_type = ( + (self.__class__.SequenceType.detect_sequence_type(self.sequence)) + if self.__class__.SequenceType.INFER + else self.sequence_type + ) + + return super().save(*args, **kwargs) + + def get_sequence(self): + """ + + Returns + _______ + + """ + return self.sequence.upper() + + def is_attached(self): + """ + + Returns + _______ + + """ + return getattr(self, "target", None) is not None + + +# GenomicInterval +# 
------------------------------------------------------------------------- # +def validate_interval_start_lteq_end(start, end): + """ + This function validates whether or not an interval's starting coordinate is less than + or equal to that interval's ending coordinate. + + Parameters + __________ + start : int + The interval's starting coordinate. + end : int + The interval's ending coordinate. + + Returns + _______ + None + If start is NoneType or end is NoneType. + + Raises + ______ + ValidationError + If an interval's starting coordinate is greater than the ending coordinate. + """ + # Intervals may be underspecified, but will be ignored so skip validation. + if start is None or end is None: + return + if start > end: + raise ValidationError( + ( + "An interval's starting coordinate cannot be greater than the " + "ending coordinate." + ) + ) + + +def validate_strand(value): + # TODO + # find the type of value + """ + This function validates a GenomicInterval strand and raises an error if the strand is invalid. + + Parameters + __________ + value : + The Genomic Interval strand to be validated. + + Raises + ______ + ValidationError + If GenomicInterval strand is not positive or negative. + """ + if value not in ("+", "-"): + raise ValidationError("GenomicInterval strand must be either '+' or '-'") + + +def validate_chromosome(value): + # TODO + # add description and type for value parameter + """ + + Parameters + __________ + value : + + Returns + _______ + None + If value is NoneType. + + Raises + ______ + ValidationError + If chromosome identifier is null. + """ + # Intervals may be underspecified, but will be ignored so skip validation. 
+ if value is None: + return + if is_null(value): + raise ValidationError("Chromosome identifier must not be null.") + + +def validate_unique_intervals(intervals): + # TODO + # add description and interval parameter type plus description + """ + + Parameters + __________ + intervals : + + Raises + ______ + ValidationError + If the same interval was specified twice. + """ + for interval1 in intervals: + for interval2 in intervals: + if ( + (interval1.pk is not None) + and (interval2.pk is not None) + and (interval1.pk == interval2.pk) + ): + continue + elif interval1 is interval2: + continue + elif interval1.equals(interval2): + raise ValidationError("You can not specify the same interval twice.") + + +# WildTypeSequence +# ------------------------------------------------------------------------- # +def validate_wildtype_sequence(seq, as_type="any"): + # TODO + # add description to as_type parameter + """ + This function checks whether or not seq is a wildtype sequence. + + Parameters + __________ + seq : str + The sequence being validated. + as_type : str + (default = "any") + + Raises + ______ + ValidationError + If seq is not a valid wild type sequence. + ValidationError + If seq is not a valid DNA or protein reference sequence. + """ + # from .models import WildTypeSequence + + # Explicitly check for these cases as they are also valid AA sequences. + if is_null(seq): + raise ValidationError( + "'%(seq)s' is not a valid wild type sequence." # , params={"seq": seq} + ) + + seq = seq.upper() + is_dna = dna_bases_validator(seq) is not None + is_aa = amino_acids_validator(seq) is not None + + if as_type == WildTypeSequence.SequenceType.DNA and not is_dna: + raise ValidationError( + "'%(seq)s' is not a valid DNA reference sequence." # , + # params={"seq": seq}, + ) + elif as_type == WildTypeSequence.SequenceType.PROTEIN and not is_aa: + raise ValidationError( + "'%(seq)s' is not a valid protein reference sequence." 
# , + # params={"seq": seq}, + ) + elif (as_type == "any" or WildTypeSequence.SequenceType.INFER) and not ( + is_dna or is_aa + ): + raise ValidationError( + "'%(seq)s' is not a valid DNA or protein reference sequence." # , + # params={"seq": seq}, + ) + + +def sequence_is_dna(seq): + """ + This function checks if seq is a DNA sequence. + + Parameters + __________ + seq : str + The sequence to be validated. + + Returns + _______ + bool + True if the dna_bases_validator returns a match object. + """ + # Explicitly check for these cases as they are also valid AA sequences. + if is_null(seq): + return False + seq = seq.upper() + return dna_bases_validator(seq) is not None + + +def sequence_is_protein(seq): + """ + This function check if seq is a protein sequence. + + Parameters + __________ + seq : str + The sequence being validated. + + Returns + _______ + bool + True if seq is not null, is a DNA sequence or amino_acids_validator returns a match object. + """ + # Explicitly check for these cases as they are also valid AA sequences. + if is_null(seq): + return False + seq = seq.upper() + if dna_bases_validator(seq) is not None: + return False # Very likely a DNA sequence if only ATG + return amino_acids_validator(seq) is not None + + +# ReferenceGenome +# ------------------------------------------------------------------------- # +def validate_organism_name(organism_name): + # TODO + # confirm organism_name type + """ + This function validates the organism name by checking that the name is not null. + + Parameters + __________ + organism_name : str + The organism name to be validated. + + Raises + ______ + ValidationError + If the organism name is null. + """ + if is_null(organism_name): + raise ValidationError("Species name must not be null.") + + +def validate_reference_genome_has_one_external_identifier(referencegenome): + # TODO + # revise description, make sure it is accurate + # anything greater than 0 will return True, so should it be == 1 or > 0? 
+ # determine what type referencegenome is + """ + This function validates whether or not the reference genome has one external identifier. + An error is raised if + + Parameters + __________ + referencegenome : + + Raises + ______ + ValidationError + If + """ + if not referencegenome.genome_id: + raise ValidationError( + "Only one external identifier can be specified for a reference" "genome." + ) + + +def validate_genome_short_name(value): + # TODO + # confirm the type of the value parameter + """ + This function validates the genome short name and raises an error if the value is null. + + Parameters + __________ + value : str + The genome short name to be validated. + + Raises + ______ + ValidationError + If the genome short name is null. + """ + if is_null(value): + raise ValidationError("Genome short name must not be null.") + + +# ReferenceMap +# ------------------------------------------------------------------------- # +def validate_map_has_unique_reference_genome(annotations): + # TODO + # check the type of annotations + # add description to annotations parameter + """ + This function validates whether or not each map in annotations has a + unique reference genome and raises an error if this is not the case. + + Parameters + __________ + annotations : + + Raises + ______ + ValidationError + If each reference map does not specify a different reference genome. + """ + genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) + if len(genomes) < len(annotations): + raise ValidationError("Each reference map must specify a different reference genome.") + + +def validate_map_has_at_least_one_interval(reference_map): + """ + This function validates that a reference map has at least one interval and raises an error + if this is not the case. + + Parameters + __________ + reference_map : + Reference map. + + Raises + ______ + ValidationError + If the reference_map does not have at least one interval. 
+ """ + if not reference_map.get_intervals().count(): + raise ValidationError("You must specify at least one interval for each reference map.") + + +def validate_at_least_one_map(reference_maps): + """ + This function validates whether a target has at least one reference map specified + and raises an error if it does not. + + Parameters + __________ + reference_maps : + + + Raises + ______ + ValidationError + If the target does not have at least one reference map specified. + """ + if not len(reference_maps): + raise ValidationError("A target must have at least one reference map specified.") + + +def validate_one_primary_map(reference_maps): + """ + This function validates the existence of one primary reference map and raises an error + if it does not exist. + + Parameters + __________ + reference_maps : + + Raises + ______ + ValidationError + If target has less than or more than one primary reference map. + """ + primary_count = sum(a.is_primary_reference_map() for a in reference_maps) + if primary_count > 1 or primary_count < 1: + raise ValidationError("A target must have one primary reference map.") + + +# TargetGene +# ------------------------------------------------------------------------- # +def validate_gene_name(gene_name): + # TODO + # confirm gene_name type + """ + This function checks to see if a gene name is null and raises and error if it is. + + Parameters + __________ + gene_name : str + The gene name. + + Raises + ______ + ValidationError + If gene name (value parameter) is null. 
+ """ + if is_null(gene_name): + raise ValidationError("Gene name must not be null.") diff --git a/mavecore/original_validation/metadata_validators.py b/mavecore/original_validation/metadata_validators.py new file mode 100644 index 0000000..3c9d5d1 --- /dev/null +++ b/mavecore/original_validation/metadata_validators.py @@ -0,0 +1,203 @@ +import idutils + +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants import null_values_re + + +def is_null(value): + # TODO + # check that parameter type is accurate + """ + This function checks that the passed value is null. + + Parameters + __________ + value : str + Value to be checked if null. + + Returns + _______ + bool + True if a stripped/lowercase value in in `nan_col_values`. + """ + value = str(value).strip().lower() + return null_values_re.fullmatch(value) or not value + + +def validate_sra_identifier(identifier): + if not ( + idutils.is_sra(identifier) + or idutils.is_bioproject(identifier) + or idutils.is_geo(identifier) + or idutils.is_arrayexpress_array(identifier) + or idutils.is_arrayexpress_experiment(identifier) + ): + raise ValidationError( + f"'{identifier} is not a valid SRA, GEO, ArrayExpress or BioProject " + "accession." + ) + + +def validate_keyword(kw): + """ + This function validates whether or not the kw parameter is valid by + checking that it is a string that is not null. If kw is null + or is not a string, an error is raised. + + Parameters + __________ + kw : str + The keyword to be validated. + + Raises + ______ + ValidationError + If the kw argument is not a valid string. + """ + if is_null(kw) or not isinstance(kw, str): + raise ValidationError( + f"'{kw}' not a valid keyword. Keywords must be valid strings." 
+ ) + + +def validate_pubmed_identifier(identifier): + """ + + :param identifier: + :return: + """ + if not idutils.is_pmid(identifier): + raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") + + +def validate_doi_identifier(identifier): + """ + + :param identifier: + :return: + """ + if not idutils.is_doi(identifier): + raise ValidationError(f"'{identifier}' is not a valid DOI.") + + +def validate_ensembl_identifier(identifier): + """ + + :param identifier: + :return: + """ + if not idutils.is_ensembl(identifier): + raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") + + +def validate_uniprot_identifier(identifier): + """ + + :param identifier: + :return: + """ + if not idutils.is_uniprot(identifier): + raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") + + +def validate_refseq_identifier(identifier): + """ + + :param identifier: + :return: + """ + if not idutils.is_refseq(identifier): + raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") + + +def validate_genome_identifier(identifier): + """ + + :param identifier: + :return: + """ + if not idutils.is_genome(identifier): + raise ValidationError( + f"'{identifier}' is not a valid GenBank or RefSeq genome assembly." + ) + + +def validate_keyword_list(values): + """ + This function takes a list of keyword values and validates that each one is valid. + A valid keyword is a non-null string. The validate_keyword function will raise an + ValidationError if any of the keywords are invalid. + + Parameters + __________ + values : list[str] + The list of values to be validated. 
+ """ + for value in values: + if not is_null(value): + validate_keyword(value) + + +def validate_pubmed_list(values): + """ + :param values: + :return: + """ + for value in values: + if not is_null(value): + validate_pubmed_identifier(value) + + +def validate_sra_list(values): + """ + + :param values: + :return: + """ + for value in values: + if not is_null(value): + validate_sra_identifier(value) + + +def validate_doi_list(values): + """ + + :param values: + :return: + """ + for value in values: + if not is_null(value): + validate_doi_identifier(value) + + +def validate_ensembl_list(values): + """ + + :param values: + :return: + """ + for value in values: + if not is_null(value): + validate_ensembl_identifier(value) + + +def validate_refseq_list(values): + """ + + :param values: + :return: + """ + for value in values: + if not is_null(value): + validate_refseq_identifier(value) + + +def validate_uniprot_list(values): + """ + + :param values: + :return: + """ + for value in values: + if not is_null(value): + validate_uniprot_identifier(value) diff --git a/mavecore/original_validation/urn_validators.py b/mavecore/original_validation/urn_validators.py new file mode 100644 index 0000000..22ebf15 --- /dev/null +++ b/mavecore/original_validation/urn_validators.py @@ -0,0 +1,155 @@ +import re +from mavecore.validation.exceptions import ValidationError + +MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 +MAVEDB_TMP_URN_DIGITS = 16 +MAVEDB_URN_MAX_LENGTH = 64 +MAVEDB_URN_NAMESPACE = "mavedb" + + +# Temp URN patterns +# --------------------------------------------------------------------------- # +MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( + width=MAVEDB_TMP_URN_DIGITS +) +MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) + + +# Experimentset Pattern/Compiled RE +MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( + namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS +) +MAVEDB_EXPERIMENTSET_URN_RE = 
re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) + +# Experiment Pattern/Compiled RE +MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( + pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] +) +MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) + +# Scoreset Pattern/Compiled RE +MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( + pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] +) +MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) + +# Variant Pattern/Compiled RE +MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( + pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] +) +MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) + +# Any Pattern/Compiled RE +MAVEDB_ANY_URN_PATTERN = "|".join( + [ + r"({pattern})".format(pattern=p) + for p in ( + MAVEDB_EXPERIMENTSET_URN_PATTERN, + MAVEDB_EXPERIMENT_URN_PATTERN, + MAVEDB_SCORESET_URN_PATTERN, + MAVEDB_VARIANT_URN_PATTERN, + MAVEDB_TMP_URN_PATTERN, + ) + ] +) +MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) + + +def validate_mavedb_urn(urn): + """ + This function validates a MaveDB urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The MaveDB urn to be validated. + + Raises + ______ + ValidationError + If the MaveDB urn is not valid. + """ + if not MAVEDB_ANY_URN_RE.match(urn): + raise ValidationError( + "%(urn)s is not a valid urn.", params={"urn": urn} + ) + + +def validate_mavedb_urn_experimentset(urn): + """ + This function validates a Experiment Set urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The Experiment Set urn to be validated. + + Raises + ______ + ValidationError + If the Experiment Set urn is not valid. 
+ """ + if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} + ) + + +def validate_mavedb_urn_experiment(urn): + """ + This function validates an Experiment urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The Experiment urn to be validated. + + Raises + ______ + ValidationError + If the Experiemnt urn is not valid. + """ + if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "%(urn)s is not a valid Experiment urn.", params={"urn": urn} + ) + + +def validate_mavedb_urn_scoreset(urn): + """ + This function validates a Scoreset urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The Scoreset urn to be validated + + Raises + ______ + ValidationError + If the Scoreset urn is not valid. + """ + if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "%(urn)s is not a valid score set urn.", params={"urn": urn} + ) + + +def validate_mavedb_urn_variant(urn): + """ + This function validates a MaveDB Variant urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The MaveDB Variant urn to be validated. + + Raises + ______ + ValidationError + If the MaveDB Variant urn is not valid. 
+ """ + if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "%(urn)s is not a valid Variant urn.", params={"urn": urn} + ) diff --git a/mavecore/original_validation/validate.py b/mavecore/original_validation/validate.py new file mode 100644 index 0000000..b138c9a --- /dev/null +++ b/mavecore/original_validation/validate.py @@ -0,0 +1,69 @@ +from mavecore.validation import dataset_validators + + +def validate_all(countfile=None, scorefile=None, scorejson=None): + """ + By calling other helper functions, this function runs all of the validation code. + + Parameters + __________ + countfile : + scorefile : + scorejson : + + """ + validate_dataset(countfile, scorefile, scorejson) + + +def validate_dataset(countfile=None, scorefile=None, scorejson=None): + """ + This function calls all of the validation functions within + mavetools/mavetools/validation/dataset_validation.py + + Parameters + __________ + countfile : + scorefile : + scorejson : + + Returns + ------- + + """ + + # how to incorporate word limit validator? 
+ + if scorefile is not None: + # open scorefile + open(scorefile) + # this one returns header + scoreheader = dataset_validators.read_header_from_io(file=scorefile) + + # if the header was returned, do these ones + dataset_validators.validate_has_hgvs_in_header(header=scoreheader) + dataset_validators.validate_at_least_one_additional_column(header=scoreheader) + dataset_validators.validate_header_contains_no_null_columns(header=scoreheader) + + dataset_validators.validate_scoreset_score_data_input(file=scorefile) + + if scorejson is not None: + # open scorejson + open(scorejson) + dataset_validators.validate_scoreset_json(dict_=scorejson) + + if countfile is not None: + # open countfile + open(countfile) + countheader = dataset_validators.read_header_from_io(file=countfile) + + # if the header was returned, do these ones + dataset_validators.validate_has_hgvs_in_header(header=countheader) + dataset_validators.validate_at_least_one_additional_column(header=countheader) + dataset_validators.validate_header_contains_no_null_columns(header=countheader) + + dataset_validators.validate_scoreset_count_data_input(file=countfile) + + if scorefile is not None and countfile is not None: + dataset_validators.validate_datasets_define_same_variants( + scores=scorefile, counts=countfile + ) diff --git a/mavecore/original_validation/variant_validators/__init__.py b/mavecore/original_validation/variant_validators/__init__.py new file mode 100644 index 0000000..1f7aca1 --- /dev/null +++ b/mavecore/original_validation/variant_validators/__init__.py @@ -0,0 +1,25 @@ +from .dataset import MaveDataset, MaveCountsDataset, MaveScoresDataset + +from .hgvs import ( + validate_nt_variant, + validate_pro_variant, + validate_splice_variant, + validate_hgvs_string, +) + +from .variant import validate_columns_match, validate_variant_json + +__all__ = [ + "dataset", + "variant", + "hgvs", + "validate_nt_variant", + "validate_splice_variant", + "validate_pro_variant", + "validate_hgvs_string", + 
"validate_columns_match", + "validate_variant_json", + "MaveCountsDataset", + "MaveScoresDataset", + "MaveDataset", +] diff --git a/mavecore/original_validation/variant_validators/dataset.py b/mavecore/original_validation/variant_validators/dataset.py new file mode 100644 index 0000000..4dacdad --- /dev/null +++ b/mavecore/original_validation/variant_validators/dataset.py @@ -0,0 +1,1019 @@ +import re +from collections import defaultdict +from io import StringIO +from itertools import groupby +from operator import itemgetter +from typing import Union, Optional, Tuple, List, TextIO, BinaryIO, Set, Dict + +import pandas as pd +import numpy as np +from mavehgvs import MaveHgvsParseError, Variant +from fqfa.util.translate import translate_dna +from fqfa.util.infer import infer_sequence_type + +from mavecore.validation.constants import ( + hgvs_nt_column, + hgvs_splice_column, + hgvs_pro_column, + required_score_column, + null_values_list, + null_values_re, + readable_null_values +) + + +def is_null(value): + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. 
+ + Parameters + __________ + value : + + Returns + _______ + bool + + """ + value = str(value).strip().lower() + return null_values_re.fullmatch(value) or not value + + +class MaveDataset: + """ + + """ + class DatasetType: + """ + + """ + SCORES = "scores" + COUNTS = "counts" + + class HGVSColumns: + """ + + """ + NUCLEOTIDE: str = hgvs_nt_column + TRANSCRIPT: str = hgvs_splice_column + PROTEIN: str = hgvs_pro_column + + @classmethod + def options(cls) -> List[str]: + """ + + Returns + _______ + List[str] + """ + return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] + + class AdditionalColumns: + """ + + """ + @classmethod + def options(cls) -> List[str]: + """ + + Returns + _______ + List[str] + """ + return [] + + # ---------------------- Construction------------------------------------ # + @classmethod + def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": + """ + + Parameters + __________ + file : Union[str, TextIO, BinaryIO] + + Returns + _______ + `MaveScoresDataset` + + """ + return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) + + @classmethod + def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": + """ + + Parameters + __________ + file : Union[str, TextIO, BinaryIO] + + Returns + _______ + `MaveCountsDataset` + """ + return cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) + + @classmethod + def _for_type( + cls, file: Union[str, TextIO, BinaryIO], dataset_type: str + ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: + """ + + Parameters + __________ + file : Union[str, TextIO, BinaryIO] + dataset_type : str + + Returns + _______ + Union[`MaveScoreDataset`, `MaveCountsDataset`] + + Raises + ______ + TypeError + If file parameter is not expected file path or buffer object. + ValueError + If dataset_type parameter is not a recognized dataset type. 
+ """ + if isinstance(file, str): + handle = file + elif hasattr(file, "read"): + file_contents = file.read() + if hasattr(file_contents, "decode"): + file_contents = file_contents.decode("utf-8") + file_contents = file_contents.strip() + handle = StringIO(file_contents) + else: + raise TypeError( + f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" + ) + + extra_na_values = set( + list(null_values_list) + + [str(x).lower() for x in null_values_list] + + [str(x).upper() for x in null_values_list] + + [str(x).capitalize() for x in null_values_list] + ) + + df = pd.read_csv( + filepath_or_buffer=handle, + sep=",", + encoding="utf-8", + quotechar='"', + comment="#", + na_values=extra_na_values, + keep_default_na=True, + dtype={ + **{c: str for c in cls.HGVSColumns.options()}, + MaveScoresDataset.AdditionalColumns.SCORES: float, + }, + ).replace(null_values_re, np.NaN) + + if dataset_type == cls.DatasetType.SCORES: + return MaveScoresDataset(df) + elif dataset_type == cls.DatasetType.COUNTS: + return MaveCountsDataset(df) + else: + raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") + + # ---------------------- Public ----------------------------------------- # + @property + def label(self) -> str: + """ + + Returns + _______ + str + """ + return "dataset" + + @property + def is_valid(self) -> Optional[bool]: + """ + + Returns + _______ + Optional[bool] + """ + if self._errors is None: + return None + return len(self._errors) == 0 + + @property + def n_errors(self) -> Optional[int]: + """ + + Returns + _______ + Optional[int] + """ + if self._errors is None: + return None + return len(self._errors) + + @property + def errors(self) -> Optional[List[str]]: + """ + + Returns + _______ + Optional[List[str]] + """ + return self._errors + + @property + def is_empty(self) -> bool: + """ + + Returns + _______ + bool + """ + return self._df.empty + + @property + def columns(self) -> List[str]: + """ + + Returns + _______ + List[str] + 
""" + return list(self._df.columns) + + @property + def hgvs_columns(self) -> List[str]: + """ + + Returns + _______ + List[str] + """ + return [c for c in self.columns if c in self.HGVSColumns.options()] + + @property + def non_hgvs_columns(self) -> List[str]: + """ + + Returns + _______ + List[str] + """ + return [c for c in self.columns if c not in self.HGVSColumns.options()] + + @property + def n_rows(self) -> int: + """ + + Returns + _______ + int + """ + return len(self._df) + + @property + def n_columns(self) -> int: + """ + + Returns + _______ + int + """ + return len(self.columns) + + @property + def index_column(self) -> Optional[str]: + """ + + Returns + _______ + Optional[str] + """ + if self._errors: + return None + return self._index_column + + @property + def index(self) -> Optional[pd.Index]: + """ + + Returns + _______ + Optional[`pd.Index`] + """ + if self._errors: + return None + return self._df.index.copy(deep=True) + + def data(self, serializable=False) -> pd.DataFrame: + """ + Return underlying dataframe object. + + Parameters + ---------- + serializable: bool + Replaces `np.NaN` with `None` for JSON compatibility. + + Returns + _______ + `pd.DataFrame` + + """ + if serializable: + # need to force "object" type to allow None values + return_df = self._df.astype(object, copy=True) + return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) + return return_df + return self._df.copy(deep=True) + + def match_other(self, other: "MaveDataset") -> Optional[bool]: + """ + Check that each dataset defined the same variants in each column. + + Parameters + ---------- + other: MaveDataset + Validator instance to match against. + + Returns + ------- + Optional[bool] + A boolean indicating index match, otherwise `None` if either instance + is not valid. 
+ """ + if (not self.is_valid) or (not other.is_valid): + return None + + if self.index_column != other.index_column: + return False + + return all( + self._df[column].equals(other._df[column]) + for column in self.HGVSColumns.options() + ) + + def to_dict(self) -> Dict[str, Dict]: + """ + Returns underlying dataframe as dictionary in 'records' orientation. + Keys will be index values and values will be an inner dictionary mapping + column names to row values for said index. + + Returns + _______ + Dict[str, Dict] + """ + # Convert np.NaN values to None for consistency across all columns and + # for compatibility in PostgresSQL queries. Replaces all values which + # are considered null by pandas with None by masking pd.notnull cells. + + return self.data(serializable=True).to_dict(orient="index") + + def validate( + self, + targetseq: Optional[str] = None, + relaxed_ordering: bool = False, + allow_index_duplicates: bool = False, + ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + allow_index_duplicates : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + + self._errors = [] + self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) + self._index_column = None + + self._validate_columns() + # Only attempt to validate variants if columns are valid + if not self._errors: + ( + self._normalize_data() + ._validate_genomic_variants(targetseq, relaxed_ordering) + ._validate_transcript_variants(targetseq, relaxed_ordering) + ._validate_protein_variants(targetseq, relaxed_ordering) + ._validate_index_column(allow_duplicates=allow_index_duplicates) + ) + + if self.is_empty: + self._errors.append( + f"No variants could be parsed from your {self.label} file. " + f"Please upload a non-empty file." 
+ ) + return self + + if not self._errors: + # Set index last as original index is used when indicating duplicate + # hgvs string row numbers in the column name used as the index ( + # either hgvs_nt when present or hgvs_pro when hgvs_nt is absent). + self._df.index = pd.Index(self._df[self.index_column]) + + return self + + # ---------------------- Private ---------------------------------------- # + def __init__( + self, + df: Optional[pd.DataFrame] = None, + index_column: Optional[str] = None, + errors: Optional[List[str]] = None, + ): + """ + + Parameters + df : + index_column : + errors : + + Raises + ______ + + """ + self._df: pd.DataFrame = pd.DataFrame() if df is None else df + self._index_column = index_column or None + self._errors = None if errors is None else list(errors) + + def __repr__(self): + """ + + Returns + _______ + + """ + return ( + f"<" + f"{self.__class__.__name__} " + f"columns={self.columns} " + f"index={self.index_column} " + f"valid={self.is_valid}" + f">" + ) + + @property + def _column_order(self) -> Dict[str, int]: + """ + + Returns + _______ + Dict[str, int] + """ + return defaultdict( + lambda: 100, + { + self.HGVSColumns.NUCLEOTIDE: 0, + self.HGVSColumns.TRANSCRIPT: 1, + self.HGVSColumns.PROTEIN: 2, + **{ + c: (2 + i) + for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) + }, + }, + ) + + def _validate_columns(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + if self._errors: + return self + + # Pandas will automatically name blank columns using the pattern below + unnamed = re.compile(r"^Unnamed: \d+$", flags=re.IGNORECASE) + columns = self.columns + if any(is_null(h) or unnamed.match(h) for h in columns): + self._errors.append( + f"Column names in your {self.label} file cannot values " + f"considered null such as the following: " + f"{', '.join(readable_null_values)}" + ) + + columns = [c for c in columns if not is_null(c)] + if len(columns) < 1: + 
self._errors.append( + f"No columns could not be parsed from your {self.label} file. " + "Make sure columns are comma delimited. Column names with " + "commas must be escaped by enclosing them in double quotes" + ) + + required = {self.HGVSColumns.NUCLEOTIDE, self.HGVSColumns.PROTEIN} + if not (set(columns) & required): + self._errors.append( + f"Your {self.label} file must define either a nucleotide " + f"hgvs column '({self.HGVSColumns.NUCLEOTIDE})' " + f"or a protein hgvs column '({self.HGVSColumns.PROTEIN})'. " + f"Columns are case-sensitive and must be comma delimited" + ) + + if not (set(columns) - set(self.HGVSColumns.options())): + self._errors.append( + f"Your {self.label} file must define at least one additional " + f"column different from '{self.HGVSColumns.NUCLEOTIDE}', " + f"'{self.HGVSColumns.TRANSCRIPT}' and " + f"'{self.HGVSColumns.PROTEIN}'" + ) + + return self + + def _normalize_data(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + """ + if self._errors: + return self + + # Initialize missing hgvs columns as empty. 
+ for c in self.HGVSColumns.options(): + if c not in self.columns: + self._df[c] = np.NaN + + column_order = self._column_order + sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) + + self._df = self._df[sorted_columns] + return self + + def _validate_genomic_variants( + self, targetseq: Optional[str] = None, relaxed_ordering: bool = False + ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): + return self + + defines_transcript_variants = not self._column_is_null( + self.HGVSColumns.TRANSCRIPT + ) + validated_variants, prefixes, errors = self._validate_variants( + column=self.HGVSColumns.NUCLEOTIDE, + splice_defined=defines_transcript_variants, + targetseq=targetseq, + relaxed_ordering=relaxed_ordering, + ) + + if ("c" in prefixes or "n" in prefixes) and "g" in prefixes: + self._errors.append( + f"{self.HGVSColumns.NUCLEOTIDE}: Genomic variants " + f"(prefix 'g.') cannot be mixed with transcript variants " + f"(prefix 'c.' or 'n.')" + ) + + if prefixes == {"g"} and not defines_transcript_variants: + self._errors.append( + f"Transcript variants ('{self.HGVSColumns.TRANSCRIPT}' column) " + f"are required when specifying genomic variants " + f"(prefix 'g.' 
in the 'hgvs_nt' column)" + ) + + self._errors += errors + + if not self._errors: + self._df[self.HGVSColumns.NUCLEOTIDE] = validated_variants + + self._index_column = self.HGVSColumns.NUCLEOTIDE + return self + + def _validate_transcript_variants( + self, targetseq: Optional[str] = None, relaxed_ordering: bool = False + ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) + defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) + + if defines_tx and (not defines_nt): + self._errors.append( + f"Genomic variants ('{self.HGVSColumns.NUCLEOTIDE}' column) " + f"must be defined when specifying transcript " + f"variants ('{self.HGVSColumns.TRANSCRIPT}' column)" + ) + + if not defines_tx: + return self + + # Don't validate transcript variants against sequence. Might come + # back to this later with research into implementing gene models. 
+ validated_variants, _, errors = self._validate_variants( + column=self.HGVSColumns.TRANSCRIPT, + targetseq=None, + relaxed_ordering=relaxed_ordering, + ) + + self._errors += errors + + if not self._errors: + self._df[self.HGVSColumns.TRANSCRIPT] = validated_variants + + return self + + def _validate_protein_variants( + self, targetseq: Optional[str] = None, relaxed_ordering: bool = False + ) -> "MaveDataset": + """ + + Parameters + __________ + targetseq : + relaxed_ordering : + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + if self._column_is_null(self.HGVSColumns.PROTEIN): + return self + + defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) + defines_splice = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) + + if defines_splice: + protein_seq = None + else: + protein_seq = targetseq + if targetseq and "dna" in infer_sequence_type(targetseq).lower(): + protein_seq, remainder = translate_dna(targetseq) + if remainder: + self._errors.insert( + 0, + "Protein variants could not be validated because the " + "length of your target sequence is not a multiple of 3", + ) + + validated_variants, _, errors = self._validate_variants( + column=self.HGVSColumns.PROTEIN, + targetseq=protein_seq, + relaxed_ordering=relaxed_ordering, + ) + + self._errors += errors + + if not self._errors: + self._df[self.HGVSColumns.PROTEIN] = validated_variants + + if not defines_nt: + self._index_column = self.HGVSColumns.PROTEIN + + return self + + def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": + """ + + Parameters + __________ + allow_duplicates : bool + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + if self._errors: + return self + + if self._index_column is None: + self._index_column = self.HGVSColumns.NUCLEOTIDE + + if self._column_is_partially_null(self._index_column): + self._errors.append( + f"Primary column (inferred as '{self._index_column}') " + f"cannot contain any null values from " + 
f"{', '.join(readable_null_values)} (case-insensitive)" + ) + + if not allow_duplicates: + dupes = self._df[self._index_column].duplicated(keep=False) + if np.any(dupes): + dup_list = zip( + self._df.loc[dupes, self._index_column], dupes.index[dupes] + ) + dupes_str = ", ".join( + f"{v}: {[(g[1] + 1) for g in groups]}" # get row numbers + for (v, groups) in groupby(dup_list, key=itemgetter(0)) + ) + self._errors.append( + f"Primary column (inferred as '{self._index_column}') " + f"contains duplicate HGVS variants: {dupes_str}" + ) + + return self + + def _validate_variants( + self, + column: str, + splice_defined: Optional[bool] = None, + targetseq: Optional[str] = None, + relaxed_ordering: bool = False, + ) -> Tuple[pd.Series, Set[str], List[str]]: + """ + + Parameters + __________ + column : str + splice_defined : Optional[bool] + targetseq : Optional[str] + relaxed_ordering : bool + + Returns + _______ + Tuple[`pd.Series`, Set[str], List[str]] + + Raises + ______ + + """ + + prefixes = set() + errors = [] + + def validate_variant(variant: str): + # TODO: logic mirrors that in validate_hgvs_string, which is kept + # as a standalone function for backwards compatibility with + # django's model validator field. Merge at some point. 
+ + if is_null(variant): + return np.NaN + else: + try: + if variant.lower() == "_sy": + errors.append( + "'_sy' is no longer supported and should be " + "replaced by 'p.(=)'" + ) + return variant + elif variant.lower() == "_wt": + errors.append( + "'_wt' is no longer supported and should be " + "replaced by one of 'g.=', 'c.=' or 'n.='" + ) + return variant + + validated = Variant( + variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering + ) + prefix = validated.prefix.lower() + prefixes.add(prefix) + + prefix_error = self._validate_variant_prefix_for_column( + variant=validated, + prefix=validated.prefix, + column=column, + splice_defined=splice_defined, + ) + if prefix_error: + errors.append(prefix_error) + + return str(validated) + + except MaveHgvsParseError as error: + errors.append(f"{variant}: {str(error)}") + return np.NaN + + validated_variants = self._df[column].apply(validate_variant) + + return validated_variants, prefixes, errors + + def _column_is_null(self, column) -> bool: + """ + + Parameters + __________ + column : + + Returns + _______ + bool + """ + return len(self._df[self._df[column].isna()]) == len(self._df) + + def _column_is_partially_null(self, column) -> bool: + """ + + Parameters + __________ + column : + + Returns + _______ + bool + """ + return 0 < len(self._df[self._df[column].isna()]) < len(self._df) + + def _column_is_fully_specified(self, column) -> bool: + """ + + Parameters + __________ + column : + + Returns + _______ + bool + """ + return len(self._df[self._df[column].isna()]) == 0 + + def _validate_variant_prefix_for_column( + self, variant: Variant, prefix: str, column: str, splice_defined: bool + ) -> Optional[str]: + """ + + Parameters + __________ + variant : Variant + prefix : str + column : str + splice_defined : bool + + Returns + _______ + Optional[str] + + Raises + ______ + ValueError + If there is an unknown column as column argument. 
+ """ + prefix = prefix.lower() + + if column == self.HGVSColumns.NUCLEOTIDE: + if splice_defined: + if prefix not in "g": + return ( + f"{column}: " + f"'{variant}' is not a genomic variant " + f"(prefix 'g.'). Nucleotide variants must " + f"be genomic if transcript variants are " + f"also present" + ) + else: + if prefix not in "cn": + return ( + f"{column}: " + f"'{variant}' is not a transcript variant. " + f"The accepted transcript variant prefixes " + f"are 'c.' or 'n.'" + ) + elif column == self.HGVSColumns.TRANSCRIPT: + if prefix not in "cn": + return ( + f"{column}: " + f"'{variant}' is not a transcript variant. The " + f"accepted transcript variant prefixes are " + f"'c.' or 'n.'" + ) + elif column == self.HGVSColumns.PROTEIN: + if prefix not in "p": + return ( + f"{column}: " + f"'{variant}' is not a protein variant. " + f"The accepted protein variant prefix is 'p.'" + ) + else: + raise ValueError( + f"Unknown column '{column}'. Expected one " + f"of {', '.join(self.HGVSColumns.options())}" + ) + + return None + + +class MaveScoresDataset(MaveDataset): + """ + + """ + class AdditionalColumns: + """ + + """ + SCORES = required_score_column + + @classmethod + def options(cls) -> List[str]: + """ + + Returns + _______ + List[str] + """ + return [cls.SCORES] + + @property + def label(self) -> str: + """ + + Returns + _______ + str + """ + return "scores" + + def _validate_columns(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + + Raises + ______ + + """ + super()._validate_columns() + + if self.AdditionalColumns.SCORES not in self.columns: + self._errors.append( + f"Your scores dataset is missing the " + f"'{self.AdditionalColumns.SCORES}' column. 
" + f"Columns are case-sensitive and must be comma delimited" + ) + + return self + + def _normalize_data(self) -> "MaveDataset": + """ + + Returns + _______ + `MaveDataset` + + Raises + ______ + ValueError + + """ + super()._normalize_data() + + should_be_numeric = [self.AdditionalColumns.SCORES] + for c in should_be_numeric: + if c in self.columns: + try: + self._df[c] = self._df[c].astype(dtype=float, errors="raise") + except ValueError as e: + self._errors.append(f"{c}: {str(e)}") + + return self + + +class MaveCountsDataset(MaveDataset): + """ + + """ + @property + def label(self) -> str: + """ + + Returns + _______ + str + """ + return "counts" diff --git a/mavecore/original_validation/variant_validators/hgvs.py b/mavecore/original_validation/variant_validators/hgvs.py new file mode 100644 index 0000000..2a57a39 --- /dev/null +++ b/mavecore/original_validation/variant_validators/hgvs.py @@ -0,0 +1,134 @@ +from functools import partial +from typing import Optional, Union + +from mavehgvs import Variant, MaveHgvsParseError +from mavecore.validation.exceptions import ValidationError + +from mavecore.validation.constants import NA_value, null_values_re + +from mavecore.validation.constants import ( + hgvs_nt_column, + hgvs_splice_column, + hgvs_pro_column, +) + + +# from core.utilities import is_null +def is_null(value): + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. 
+ + Parameters + __________ + value + + Returns + _______ + + """ + value = str(value).strip().lower() + return null_values_re.fullmatch(value) or not value + + +def validate_hgvs_string( + value: Union[str, bytes], + column: Optional[str] = None, + splice_present: bool = False, + targetseq: Optional[str] = None, + relaxed_ordering: bool = False, +) -> Optional[str]: + """ + + Parameters + __________ + value : Union[str, bytes] + column : Optional[str] = None + splice_present : + targetseq : + relaxed_ordering : + + Returns + _______ + + Raises + ______ + ValidationError + If variant HGVS input values are not strings. + ValidationError + If value is _sy or _wt, which are no longer supported. + ValidationError + If + ValidationError + If value is not a genomic variant (prefix 'g.'). Nucleotide variants must + be genomic if transcript variants are also defined. + ValidationError + If value is not a transcript variant. The accepted transcript variant + prefixes are 'c.', 'n.'. + ValidationError + If value is not a protein variant. The accepted protein variant prefix is 'p.'. + ValueError + If there exists an unknown column. Function expects nt, splice or p." + """ + if is_null(value): + return None + + if hasattr(value, "decode"): + value = value.decode() + if not isinstance(value, str): + raise ValidationError( + "Variant HGVS values input must be strings. 
" + "'{}' has the type '{}'.".format(value, type(value).__name__) + ) + + if value.lower() == "_sy": + raise ValidationError( + "_sy is no longer supported and should be replaced by p.(=)" + ) + elif value.lower() == "_wt": + raise ValidationError( + "_wt is no longer supported and should be replaced by (cgnp).=" + ) + + try: + variant = Variant( + s=value, targetseq=targetseq, relaxed_ordering=relaxed_ordering + ) + except MaveHgvsParseError as error: + raise ValidationError(f"{value}: {str(error)}") + + prefix = variant.prefix.lower() + if column in ("nt", hgvs_nt_column): + if splice_present: + if prefix not in "g": + raise ValidationError( + f"'{value}' is not a genomic variant (prefix 'g.'). " + f"Nucleotide variants must be genomic if transcript " + f"variants are also defined." + ) + else: + if prefix not in "cn": + raise ValidationError( + f"'{value}' is not a transcript variant. The accepted " + f"transcript variant prefixes are 'c.', 'n.'." + ) + elif column in ("splice", hgvs_splice_column): + if prefix not in "cn": + raise ValidationError( + f"'{value}' is not a transcript variant. The accepted " + f"transcript variant prefixes are 'c.', 'n.'." + ) + elif column in ("p", hgvs_pro_column): + if prefix not in "p": + raise ValidationError( + f"'{value}' is not a protein variant. The accepted " + f"protein variant prefix is 'p.'." + ) + else: + raise ValueError("Unknown column '{}'. 
Expected nt, splice or p".format(column)) + + return str(variant) + + +validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) +validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) +validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) diff --git a/mavecore/original_validation/variant_validators/variant.py b/mavecore/original_validation/variant_validators/variant.py new file mode 100644 index 0000000..bf00e71 --- /dev/null +++ b/mavecore/original_validation/variant_validators/variant.py @@ -0,0 +1,85 @@ +from typing import Dict + +from mavecore.validation.constants import ( + variant_score_data, + variant_count_data, + required_score_column, +) +from mavecore.validation.exceptions import ValidationError + + +def validate_columns_match(variant, scoreset) -> None: + # TODO + # document errors correctly, note key error + """ + Validate that a child matches parents defined columns to keep + data in sync. + + Parameters + __________ + variant : + scoreset : + + Raises + ______ + ValidationError + If variant score columns do not match scoreset score columns. + ValidationError + If variant count columns do not match scoreset count columns. + """ + try: + if variant.score_columns != scoreset.score_columns: + raise ValidationError( + f"Variant defines score columns '{variant.score_columns}' " + f"but parent defines columns '{scoreset.score_columns}. " + ) + if variant.count_columns != scoreset.count_columns: + raise ValidationError( + f"Variant defines count columns '{variant.count_columns}' " + f"but parent defines columns '{scoreset.count_columns}. " + ) + except KeyError as error: + raise ValidationError(f"Missing key {str(error)}") + + +def validate_variant_json(data: Dict[str, Dict]) -> None: + """ + Checks a given dictionary to ensure that it is suitable to be used + as the `data` attribute in a :class:`Variant` instance. 
+ + Parameters + ---------- + data : dict[str, dict] + Dictionary of keys mapping to a list. + + Raises + ______ + ValidationError + If missing the required key. + ValidationError + If missing the required column in variant's score data. + ValidationError + If encountered unexpected keys. + ValidationError + If value for key is not of type dict. + """ + expected_keys = [variant_score_data, variant_count_data] + for key in expected_keys: + if key not in data.keys(): + raise ValidationError(f"Missing the required key {key}") + + if required_score_column not in data[variant_score_data]: + raise ValidationError( + f"Missing required column '{required_score_column}' in variant's score data." + ) + + extras = [k for k in data.keys() if k not in set(expected_keys)] + if len(extras) > 0: + extras = [k for k in data.keys() if k not in expected_keys] + raise ValidationError("Encountered unexpected keys {extras}") + + # Check the correct data types are given. + for key in expected_keys: + if not isinstance(data[key], dict): + type_ = type(data[key]).__name__ + raise ValidationError(f"Value for '{key}' must be a dict not {type_}.") From ef3dbfefc4a1a90d305ea2145f99087e4b96524f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 22 Mar 2022 15:42:45 -0700 Subject: [PATCH 239/877] add TODO --- .../validation/variant_validators/dataset.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 404ecd2..4880f52 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -23,6 +23,7 @@ def is_null(value): + # TODO """ Returns True if a stripped/lowercase value in in `nan_col_values`. 
@@ -39,10 +40,12 @@ def is_null(value): class MaveDataset: + # TODO """ """ class DatasetType: + # TODO """ """ @@ -50,6 +53,7 @@ class DatasetType: COUNTS = "counts" class HGVSColumns: + # TODO """ """ @@ -59,6 +63,7 @@ class HGVSColumns: @classmethod def options(cls) -> List[str]: + # TODO """ Returns @@ -68,11 +73,13 @@ def options(cls) -> List[str]: return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] class AdditionalColumns: + # TODO """ """ @classmethod def options(cls) -> List[str]: + # TODO """ Returns @@ -84,6 +91,7 @@ def options(cls) -> List[str]: # ---------------------- Construction------------------------------------ # @classmethod def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": + # TODO """ Parameters @@ -99,6 +107,7 @@ def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": @classmethod def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": + # TODO """ Parameters @@ -115,6 +124,7 @@ def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": def _for_type( cls, file: Union[str, TextIO, BinaryIO], dataset_type: str ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: + # TODO """ Parameters @@ -177,6 +187,7 @@ def _for_type( # ---------------------- Public ----------------------------------------- # @property def label(self) -> str: + # TODO """ Returns @@ -187,6 +198,7 @@ def label(self) -> str: @property def is_valid(self) -> Optional[bool]: + # TODO """ Returns @@ -199,6 +211,7 @@ def is_valid(self) -> Optional[bool]: @property def n_errors(self) -> Optional[int]: + # TODO """ Returns @@ -211,6 +224,7 @@ def n_errors(self) -> Optional[int]: @property def errors(self) -> Optional[List[str]]: + # TODO """ Returns @@ -221,6 +235,7 @@ def errors(self) -> Optional[List[str]]: @property def is_empty(self) -> bool: + # TODO """ Returns @@ -231,6 +246,7 @@ def is_empty(self) -> bool: @property def columns(self) -> List[str]: + # TODO """ Returns @@ 
-241,6 +257,7 @@ def columns(self) -> List[str]: @property def hgvs_columns(self) -> List[str]: + # TODO """ Returns @@ -251,6 +268,7 @@ def hgvs_columns(self) -> List[str]: @property def non_hgvs_columns(self) -> List[str]: + # TODO """ Returns @@ -261,6 +279,7 @@ def non_hgvs_columns(self) -> List[str]: @property def n_rows(self) -> int: + # TODO """ Returns @@ -271,6 +290,7 @@ def n_rows(self) -> int: @property def n_columns(self) -> int: + # TODO """ Returns @@ -281,6 +301,7 @@ def n_columns(self) -> int: @property def index_column(self) -> Optional[str]: + # TODO """ Returns @@ -293,6 +314,7 @@ def index_column(self) -> Optional[str]: @property def index(self) -> Optional[pd.Index]: + # TODO """ Returns @@ -304,6 +326,7 @@ def index(self) -> Optional[pd.Index]: return self._df.index.copy(deep=True) def data(self, serializable=False) -> pd.DataFrame: + # TODO """ Return underlying dataframe object. @@ -325,6 +348,7 @@ def data(self, serializable=False) -> pd.DataFrame: return self._df.copy(deep=True) def match_other(self, other: "MaveDataset") -> Optional[bool]: + # TODO """ Check that each dataset defined the same variants in each column. @@ -351,6 +375,7 @@ def match_other(self, other: "MaveDataset") -> Optional[bool]: ) def to_dict(self) -> Dict[str, Dict]: + # TODO """ Returns underlying dataframe as dictionary in 'records' orientation. 
Keys will be index values and values will be an inner dictionary mapping @@ -372,6 +397,7 @@ def validate( relaxed_ordering: bool = False, allow_index_duplicates: bool = False, ) -> "MaveDataset": + # TODO """ Parameters @@ -426,6 +452,7 @@ def __init__( index_column: Optional[str] = None, errors: Optional[List[str]] = None, ): + # TODO """ Parameters @@ -442,6 +469,7 @@ def __init__( self._errors = None if errors is None else list(errors) def __repr__(self): + # TODO """ Returns @@ -459,6 +487,7 @@ def __repr__(self): @property def _column_order(self) -> Dict[str, int]: + # TODO """ Returns @@ -479,6 +508,7 @@ def _column_order(self) -> Dict[str, int]: ) def _validate_columns(self) -> "MaveDataset": + # TODO """ Returns @@ -530,6 +560,7 @@ def _validate_columns(self) -> "MaveDataset": return self def _normalize_data(self) -> "MaveDataset": + # TODO """ Returns @@ -553,6 +584,7 @@ def _normalize_data(self) -> "MaveDataset": def _validate_genomic_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": + # TODO """ Parameters @@ -606,6 +638,7 @@ def _validate_genomic_variants( def _validate_transcript_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": + # TODO """ Parameters @@ -652,6 +685,7 @@ def _validate_transcript_variants( def _validate_protein_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": + # TODO """ Parameters @@ -703,6 +737,7 @@ def _validate_protein_variants( return self def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": + # TODO """ Parameters @@ -754,6 +789,7 @@ def _validate_variants( targetseq: Optional[str] = None, relaxed_ordering: bool = False, ) -> Tuple[pd.Series, Set[str], List[str]]: + # TODO """ Parameters @@ -776,6 +812,7 @@ def _validate_variants( errors = [] def validate_variant(variant: str): + # TODO # TODO: logic mirrors that in validate_hgvs_string, which is 
kept # as a standalone function for backwards compatibility with # django's model validator field. Merge at some point. @@ -823,6 +860,7 @@ def validate_variant(variant: str): return validated_variants, prefixes, errors def _column_is_null(self, column) -> bool: + # TODO """ Parameters @@ -836,6 +874,7 @@ def _column_is_null(self, column) -> bool: return len(self._df[self._df[column].isna()]) == len(self._df) def _column_is_partially_null(self, column) -> bool: + # TODO """ Parameters @@ -849,6 +888,7 @@ def _column_is_partially_null(self, column) -> bool: return 0 < len(self._df[self._df[column].isna()]) < len(self._df) def _column_is_fully_specified(self, column) -> bool: + # TODO """ Parameters @@ -864,6 +904,7 @@ def _column_is_fully_specified(self, column) -> bool: def _validate_variant_prefix_for_column( self, variant: Variant, prefix: str, column: str, splice_defined: bool ) -> Optional[str]: + # TODO """ Parameters @@ -927,10 +968,12 @@ def _validate_variant_prefix_for_column( class MaveScoresDataset(MaveDataset): + # TODO """ """ class AdditionalColumns: + # TODO """ """ @@ -938,6 +981,7 @@ class AdditionalColumns: @classmethod def options(cls) -> List[str]: + # TODO """ Returns @@ -948,6 +992,7 @@ def options(cls) -> List[str]: @property def label(self) -> str: + # TODO """ Returns @@ -957,6 +1002,7 @@ def label(self) -> str: return "scores" def _validate_columns(self) -> "MaveDataset": + # TODO """ Returns @@ -979,6 +1025,7 @@ def _validate_columns(self) -> "MaveDataset": return self def _normalize_data(self) -> "MaveDataset": + # TODO """ Returns @@ -1004,11 +1051,13 @@ def _normalize_data(self) -> "MaveDataset": class MaveCountsDataset(MaveDataset): + # TODO """ """ @property def label(self) -> str: + # TODO """ Returns From bc019f4a7a48b4d59499c007de31e02e5a81d742 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 24 Mar 2022 09:56:04 -0700 Subject: [PATCH 240/877] add requirements file --- requirements.txt | 
6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1424f39 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +pandas~=1.4.1 +mavehgvs~=0.4.0 +numpy~=1.22.2 +fqfa~=1.2.1 +IDUtils~=1.1.12 +setuptools~=60.9.3 \ No newline at end of file From 49f50d2cd4b5dd8756705a22ae02df55e4f7039b Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:26:40 +1100 Subject: [PATCH 241/877] update requirements --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index ac68b6b..5eb7191 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,13 @@ import setuptools -import sys with open("README.md", "r") as fh: long_description = fh.read() -requirements = ["fqfa>=1.2.1"] -# fqfa requires backported dataclasses in Python 3.6 -if sys.version_info.major == 3 and sys.version_info.minor == 6: - requirements.append("dataclasses") +requirements = ["fqfa>=1.2.1", + "mavehgvs>=0.4.0", + "idutils>=1.1.0", + "pandas>=1.1.0", + ] setuptools.setup( name="mavecore", From dce6a7b059aa370f2d8f9987ce1f9cebc22ec801 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:26:53 +1100 Subject: [PATCH 242/877] update setup metadata --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 5eb7191..335703f 100644 --- a/setup.py +++ b/setup.py @@ -12,17 +12,17 @@ setuptools.setup( name="mavecore", version="0.1.0", - author="Daniel Esposito and Alan F Rubin", + author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", description=( - "MaveCore is to create a new dependency that contains all the shared functionality for MaveTools and MaveDB." + "MaveCore implements shared functionality for MaveTools and MaveDB." 
), long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/VariantEffect/MaveCore/tree/add_validation", + url="https://github.com/VariantEffect/MaveCore/", packages=setuptools.find_packages(), classifiers=[ - "Development Status :: 3 - Alpha", + "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Bio-Informatics", "License :: OSI Approved :: BSD License", From 9d4b0c3bcc6da870e85551b49345e33b4cf2711d Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:28:14 +1100 Subject: [PATCH 243/877] add pycharm files to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b6e4761..bd6ad26 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm +.idea/ From 12cb6075b54246bf78b561ee39db1c8bfef5a9f3 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:56:26 +1100 Subject: [PATCH 244/877] black formatting --- mavecore/__init__.py | 2 +- mavecore/original_validation/constants.py | 4 +-- .../original_validation/dataset_validators.py | 1 + .../original_validation/genome_validators.py | 18 ++++++++----- .../original_validation/urn_validators.py | 4 +-- .../variant_validators/dataset.py | 23 ++++++---------- mavecore/validation/constants.py | 4 +-- mavecore/validation/dataset_validators.py | 1 + mavecore/validation/genome_validators.py | 18 ++++++++----- mavecore/validation/urn_validators.py | 4 +-- .../validation/variant_validators/dataset.py | 26 ++++++------------- setup.py | 17 ++++++------ .../test_dataset_validators.py | 4 ++- .../test_validators.py | 2 +- 14 files changed, 59 insertions(+), 69 deletions(-) diff --git a/mavecore/__init__.py b/mavecore/__init__.py index ecbf958..54c1bf6 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -8,4 +8,4 @@ "validate_columns_match", "validate_variant_json", 
"validate_hgvs_string", -] \ No newline at end of file +] diff --git a/mavecore/original_validation/constants.py b/mavecore/original_validation/constants.py index f10aab8..6630323 100644 --- a/mavecore/original_validation/constants.py +++ b/mavecore/original_validation/constants.py @@ -30,8 +30,8 @@ AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" DNA_LETTERS = "ATCG" -DNA_SEQ_PATTERN = fr"[{DNA_LETTERS}]+" -AA_SEQ_PATTERN = fr"[{AA_LETTERS}]+" +DNA_SEQ_PATTERN = rf"[{DNA_LETTERS}]+" +AA_SEQ_PATTERN = rf"[{AA_LETTERS}]+" """ diff --git a/mavecore/original_validation/dataset_validators.py b/mavecore/original_validation/dataset_validators.py index 7a0b669..3e5c464 100644 --- a/mavecore/original_validation/dataset_validators.py +++ b/mavecore/original_validation/dataset_validators.py @@ -38,6 +38,7 @@ class WordLimitValidator: counter : str """ + message = "This field is limited to {} words." code = "invalid" counter = re.compile(r"\w+\b", flags=re.IGNORECASE) diff --git a/mavecore/original_validation/genome_validators.py b/mavecore/original_validation/genome_validators.py index ba4845d..dff8b69 100644 --- a/mavecore/original_validation/genome_validators.py +++ b/mavecore/original_validation/genome_validators.py @@ -56,9 +56,8 @@ class WildTypeSequence: """ class SequenceType: - """ + """ """ - """ DNA = "dna" PROTEIN = "protein" INFER = "infer" @@ -133,9 +132,8 @@ def choices(cls): return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] class Meta: - """ + """ """ - """ verbose_name = "Reference sequence" verbose_name_plural = "Reference sequences" @@ -519,7 +517,9 @@ def validate_map_has_unique_reference_genome(annotations): """ genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) if len(genomes) < len(annotations): - raise ValidationError("Each reference map must specify a different reference genome.") + raise ValidationError( + "Each reference map must specify a different reference genome." 
+ ) def validate_map_has_at_least_one_interval(reference_map): @@ -538,7 +538,9 @@ def validate_map_has_at_least_one_interval(reference_map): If the reference_map does not have at least one interval. """ if not reference_map.get_intervals().count(): - raise ValidationError("You must specify at least one interval for each reference map.") + raise ValidationError( + "You must specify at least one interval for each reference map." + ) def validate_at_least_one_map(reference_maps): @@ -557,7 +559,9 @@ def validate_at_least_one_map(reference_maps): If the target does not have at least one reference map specified. """ if not len(reference_maps): - raise ValidationError("A target must have at least one reference map specified.") + raise ValidationError( + "A target must have at least one reference map specified." + ) def validate_one_primary_map(reference_maps): diff --git a/mavecore/original_validation/urn_validators.py b/mavecore/original_validation/urn_validators.py index 22ebf15..f81b8fd 100644 --- a/mavecore/original_validation/urn_validators.py +++ b/mavecore/original_validation/urn_validators.py @@ -70,9 +70,7 @@ def validate_mavedb_urn(urn): If the MaveDB urn is not valid. 
""" if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError( - "%(urn)s is not a valid urn.", params={"urn": urn} - ) + raise ValidationError("%(urn)s is not a valid urn.", params={"urn": urn}) def validate_mavedb_urn_experimentset(urn): diff --git a/mavecore/original_validation/variant_validators/dataset.py b/mavecore/original_validation/variant_validators/dataset.py index 4dacdad..3b67cd5 100644 --- a/mavecore/original_validation/variant_validators/dataset.py +++ b/mavecore/original_validation/variant_validators/dataset.py @@ -18,7 +18,7 @@ required_score_column, null_values_list, null_values_re, - readable_null_values + readable_null_values, ) @@ -40,20 +40,17 @@ def is_null(value): class MaveDataset: - """ + """ """ - """ class DatasetType: - """ + """ """ - """ SCORES = "scores" COUNTS = "counts" class HGVSColumns: - """ + """ """ - """ NUCLEOTIDE: str = hgvs_nt_column TRANSCRIPT: str = hgvs_splice_column PROTEIN: str = hgvs_pro_column @@ -69,9 +66,8 @@ def options(cls) -> List[str]: return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] class AdditionalColumns: - """ + """ """ - """ @classmethod def options(cls) -> List[str]: """ @@ -928,13 +924,11 @@ def _validate_variant_prefix_for_column( class MaveScoresDataset(MaveDataset): - """ + """ """ - """ class AdditionalColumns: - """ + """ """ - """ SCORES = required_score_column @classmethod @@ -1005,9 +999,8 @@ def _normalize_data(self) -> "MaveDataset": class MaveCountsDataset(MaveDataset): - """ + """ """ - """ @property def label(self) -> str: """ diff --git a/mavecore/validation/constants.py b/mavecore/validation/constants.py index f10aab8..6630323 100644 --- a/mavecore/validation/constants.py +++ b/mavecore/validation/constants.py @@ -30,8 +30,8 @@ AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" DNA_LETTERS = "ATCG" -DNA_SEQ_PATTERN = fr"[{DNA_LETTERS}]+" -AA_SEQ_PATTERN = fr"[{AA_LETTERS}]+" +DNA_SEQ_PATTERN = rf"[{DNA_LETTERS}]+" +AA_SEQ_PATTERN = rf"[{AA_LETTERS}]+" """ diff --git 
a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 03a3d6c..b8585f4 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -30,6 +30,7 @@ class WordLimitValidator: This class """ + message = "This field is limited to {} words." code = "invalid" counter = re.compile(r"\w+\b", flags=re.IGNORECASE) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index ba4845d..dff8b69 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -56,9 +56,8 @@ class WildTypeSequence: """ class SequenceType: - """ + """ """ - """ DNA = "dna" PROTEIN = "protein" INFER = "infer" @@ -133,9 +132,8 @@ def choices(cls): return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] class Meta: - """ + """ """ - """ verbose_name = "Reference sequence" verbose_name_plural = "Reference sequences" @@ -519,7 +517,9 @@ def validate_map_has_unique_reference_genome(annotations): """ genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) if len(genomes) < len(annotations): - raise ValidationError("Each reference map must specify a different reference genome.") + raise ValidationError( + "Each reference map must specify a different reference genome." + ) def validate_map_has_at_least_one_interval(reference_map): @@ -538,7 +538,9 @@ def validate_map_has_at_least_one_interval(reference_map): If the reference_map does not have at least one interval. """ if not reference_map.get_intervals().count(): - raise ValidationError("You must specify at least one interval for each reference map.") + raise ValidationError( + "You must specify at least one interval for each reference map." + ) def validate_at_least_one_map(reference_maps): @@ -557,7 +559,9 @@ def validate_at_least_one_map(reference_maps): If the target does not have at least one reference map specified. 
""" if not len(reference_maps): - raise ValidationError("A target must have at least one reference map specified.") + raise ValidationError( + "A target must have at least one reference map specified." + ) def validate_one_primary_map(reference_maps): diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 22ebf15..f81b8fd 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -70,9 +70,7 @@ def validate_mavedb_urn(urn): If the MaveDB urn is not valid. """ if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError( - "%(urn)s is not a valid urn.", params={"urn": urn} - ) + raise ValidationError("%(urn)s is not a valid urn.", params={"urn": urn}) def validate_mavedb_urn_experimentset(urn): diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 4880f52..55b4027 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -18,7 +18,7 @@ required_score_column, null_values_list, null_values_re, - readable_null_values + readable_null_values, ) @@ -41,22 +41,17 @@ def is_null(value): class MaveDataset: # TODO - """ + """ """ - """ class DatasetType: # TODO - """ - - """ + """ """ SCORES = "scores" COUNTS = "counts" class HGVSColumns: # TODO - """ - - """ + """ """ NUCLEOTIDE: str = hgvs_nt_column TRANSCRIPT: str = hgvs_splice_column PROTEIN: str = hgvs_pro_column @@ -74,9 +69,8 @@ def options(cls) -> List[str]: class AdditionalColumns: # TODO - """ + """ """ - """ @classmethod def options(cls) -> List[str]: # TODO @@ -969,14 +963,11 @@ def _validate_variant_prefix_for_column( class MaveScoresDataset(MaveDataset): # TODO - """ + """ """ - """ class AdditionalColumns: # TODO - """ - - """ + """ """ SCORES = required_score_column @classmethod @@ -1052,9 +1043,8 @@ def _normalize_data(self) -> "MaveDataset": class MaveCountsDataset(MaveDataset): # TODO - """ + """ 
""" - """ @property def label(self) -> str: # TODO diff --git a/setup.py b/setup.py index 335703f..bb496c2 100644 --- a/setup.py +++ b/setup.py @@ -3,20 +3,19 @@ with open("README.md", "r") as fh: long_description = fh.read() -requirements = ["fqfa>=1.2.1", - "mavehgvs>=0.4.0", - "idutils>=1.1.0", - "pandas>=1.1.0", - ] +requirements = [ + "fqfa>=1.2.1", + "mavehgvs>=0.4.0", + "idutils>=1.1.0", + "pandas>=1.1.0", +] setuptools.setup( name="mavecore", version="0.1.0", author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", - description=( - "MaveCore implements shared functionality for MaveTools and MaveDB." - ), + description=("MaveCore implements shared functionality for MaveTools and MaveDB."), long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/VariantEffect/MaveCore/", @@ -32,4 +31,4 @@ python_requires=">=3.6", install_requires=requirements, test_suite="tests", -) \ No newline at end of file +) diff --git a/tests/test_validation/test_dataset_validators.py b/tests/test_validation/test_dataset_validators.py index 3bca487..6b4895a 100644 --- a/tests/test_validation/test_dataset_validators.py +++ b/tests/test_validation/test_dataset_validators.py @@ -90,7 +90,9 @@ def test_raises_valuerror_when_null_values_in_column(self): header = read_header_from_io(file) validate_header_contains_no_null_columns(header) - def test_does_not_raise_valuerror_when_non_null_values_in_column(self,): + def test_does_not_raise_valuerror_when_non_null_values_in_column( + self, + ): file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) header = read_header_from_io(file) validate_header_contains_no_null_columns(header) # Should pass diff --git a/tests/test_validation/test_variant_validators/test_validators.py b/tests/test_validation/test_variant_validators/test_validators.py index bd144b9..495ca3e 100644 --- a/tests/test_validation/test_variant_validators/test_validators.py +++ 
b/tests/test_validation/test_variant_validators/test_validators.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import StringIO import unittest from unittest import TestCase from random import choice From e8b83986781e8c68d8661d32a172c848f6d3c5eb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 15:57:19 -0700 Subject: [PATCH 245/877] import ValidationError --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 03a3d6c..9eec939 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -5,7 +5,7 @@ from numpy.testing import assert_array_equal from mavecore.validation import constants - +from mavecore.validation.exceptions import ValidationError def is_null(value): """ From c474564ed56f56b3210e51a26e9f64e8c8d62f86 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 15:57:38 -0700 Subject: [PATCH 246/877] delete is_null function, import from utilities.py --- mavecore/validation/dataset_validators.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 9eec939..b4db3df 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -7,22 +7,7 @@ from mavecore.validation import constants from mavecore.validation.exceptions import ValidationError -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. - - Parameters - __________ - value : str - The value to be checked as null or not. - - Returns - _______ - bool - True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. 
- """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value +from mavecore.validation.utilities import is_null class WordLimitValidator: From 22b14202641b8568aa306b953d5724d7ede64c5c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 15:57:58 -0700 Subject: [PATCH 247/877] add attributes to contructor docstring --- mavecore/validation/dataset_validators.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index b4db3df..daeecde 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -14,6 +14,14 @@ class WordLimitValidator: """ This class + Attributes + __________ + message : str + Message template to describe how many words a field is limited to. + code : str + + counter : str + """ message = "This field is limited to {} words." code = "invalid" From 2649f0fcde77a10e3b18b0d34d80be5feb003057 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 15:59:09 -0700 Subject: [PATCH 248/877] substitute ValueError for ValidationError --- mavecore/validation/dataset_validators.py | 46 +++++++++++------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index daeecde..7a38d26 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -60,13 +60,13 @@ def __call__(self, value): Raises ______ - ValueError + ValidationError If """ if not value: return if len(self.counter.findall(value)) > self.word_limit: - raise ValueError(self.message.format(self.word_limit)) + raise ValidationError(self.message.format(self.word_limit)) def read_header_from_io(file, label=None, msg=None): @@ -90,7 +90,7 @@ def read_header_from_io(file, label=None, 
msg=None): Raises ______ - ValueError + ValidationError If a header could not be parsed from file. Columns must be coma delimited. Column names with commas must be escaped by enclosing them in double quotes. """ @@ -111,7 +111,7 @@ def read_header_from_io(file, label=None, msg=None): "Columns are comma delimited. Column names with commas must be" "escaped by enclosing them in double quotes.".format(label) ) - raise ValueError(msg) + raise ValidationError(msg) def validate_has_hgvs_in_header(header, label=None, msg=None): @@ -126,7 +126,7 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): Raises ______ - ValueError + ValidationError If """ if label is None: @@ -144,7 +144,7 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): "col_p": constants.hgvs_pro_column, } if not set(header) & set(constants.hgvs_columns): - raise ValueError(msg) + raise ValidationError(msg) def validate_at_least_one_additional_column(header, label=None, msg=None): @@ -164,7 +164,7 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): Raises ______ - ValueError + ValidationError If there are not additional columns in the header argument. """ if label is None: @@ -182,7 +182,7 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): ) ) params = {"label": label} - raise ValueError(msg) + raise ValidationError(msg) def validate_header_contains_no_null_columns(header, label=None, msg=None): @@ -200,7 +200,7 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): Raises ______ - ValueError + ValidationError If the file header contains blank/empty/whitespace. Only columns or the case-insensitive null values listed in constants.readable_null_values are permitted. 
@@ -215,7 +215,7 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): "only columns or the following case-insensitive null " "values: {}.".format(label, ", ".join(constants.readable_null_values)) ) - raise ValueError(msg) + raise ValidationError(msg) def validate_datasets_define_same_variants(scores, counts): @@ -232,7 +232,7 @@ def validate_datasets_define_same_variants(scores, counts): Raises ______ - ValueError + ValidationError If score and counts files do not define the same variants. """ try: @@ -249,7 +249,7 @@ def validate_datasets_define_same_variants(scores, counts): counts[constants.hgvs_pro_column].sort_values().values, ) except AssertionError: - raise ValueError( + raise ValidationError( "Your score and counts files do not define the same variants. " "Check that the hgvs columns in both files match." ) @@ -278,7 +278,7 @@ def validate_scoreset_score_data_input(file): validate_at_least_one_additional_column(header, label="Score") if constants.required_score_column not in header: - raise ValueError( + raise ValidationError( "Score data file is missing the required column " + constants.required_score_column + "." @@ -316,37 +316,37 @@ def validate_scoreset_json(dict_): Raises ______ - ValueError + ValidationError If scoreset data is missing the required key. - ValueError + ValidationError If header values are not strings. - ValueError + ValidationError If - ValueError + ValidationError If missing required column constants.required_score_column for score dataset. - ValueError + ValidationError If encountered unexpected keys extras. 
""" required_columns = [constants.score_columns, constants.count_columns] for key in required_columns: if key not in dict_.keys(): - raise ValueError("Scoreset data is missing the required key " + key) + raise ValidationError("Scoreset data is missing the required key " + key) columns = dict_[key] if not all([isinstance(c, str) for c in columns]): - raise ValueError("Header values must be strings.") + raise ValidationError("Header values must be strings.") if not isinstance(columns, list): type_ = type(columns).__name__ - raise ValueError( + raise ValidationError( "Value for " + key.replace("_", " ") + " must be a list not " + type_ ) # Check score columns is not-empty and at least contains hgvs and score if key == constants.score_columns: if constants.required_score_column not in columns: - raise ValueError( + raise ValidationError( "Missing required column constants.required_score_column " "for score dataset." ) @@ -356,4 +356,4 @@ def validate_scoreset_json(dict_): extras = [k for k in dict_.keys() if k not in set(required_columns)] if len(extras) > 0: extras = [k for k in dict_.keys() if k not in required_columns] - raise ValueError("Encountered unexpected keys extras") + raise ValidationError("Encountered unexpected keys extras") From 0a18bc2b037b982c176167df223ec217390c7028 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:00:38 -0700 Subject: [PATCH 249/877] delete is_null, import is_null from utilities.py --- mavecore/validation/genome_validators.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index ba4845d..d506a6d 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -14,23 +14,7 @@ from mavecore.validation import constants - -def is_null(value): - """ - This function checks if the value exists or is null. 
- - Parameters - __________ - value : - The value to be checked. - - Returns - _______ - bool - True if a stripped/lowercase value in `nan_col_values`. - """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value +from mavecore.validation.utilities import is_null # min_start_validator = MinValueValidator( From ab8890d33e05dcdbde8c3a089d1687338ebcbb05 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:00:53 -0700 Subject: [PATCH 250/877] import re --- mavecore/validation/variant_validators/hgvs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 5f19735..f77f87c 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -1,6 +1,5 @@ from functools import partial from typing import Optional, Union -import re from mavehgvs import Variant, MaveHgvsParseError from mavecore.validation.exceptions import ValidationError From b92a6eb46ec769aefc5c4d46c4009a303cfbd388 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:01:24 -0700 Subject: [PATCH 251/877] import constants --- mavecore/validation/variant_validators/hgvs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index f77f87c..a1e7510 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -4,8 +4,6 @@ from mavehgvs import Variant, MaveHgvsParseError from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import NA_value, null_values_re - from mavecore.validation.constants import ( hgvs_nt_column, hgvs_splice_column, From d8c57175fbcb3a1a64e8eb615942da1b35a569c9 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:02:08 -0700 Subject: [PATCH 252/877] delete is_null function and import is_null from utilities.py --- mavecore/validation/variant_validators/hgvs.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index a1e7510..2ea185c 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -11,22 +11,7 @@ ) -# from core.utilities import is_null -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. - - Parameters - __________ - value - - Returns - _______ - - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - +from mavecore.validation.utilities import is_null def validate_hgvs_string( value: Union[str, bytes], From 335bb0db336124b8465475809d0e8a3293232be5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:02:57 -0700 Subject: [PATCH 253/877] add parameter types to docstring --- mavecore/validation/variant_validators/hgvs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 2ea185c..10cd7ca 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -24,8 +24,8 @@ def validate_hgvs_string( Parameters __________ - value : - column : + value : Union[str, bytes] + column : Optional[str] = None splice_present : targetseq : relaxed_ordering : From 314b4d7133f3bdef8553d2ffa271cb84835720c8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:04:04 -0700 Subject: [PATCH 254/877] delete is_null function and import from utilities.py --- 
mavecore/validation/metadata_validators.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 3c9d5d1..41e4079 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -1,27 +1,7 @@ import idutils from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import null_values_re - - -def is_null(value): - # TODO - # check that parameter type is accurate - """ - This function checks that the passed value is null. - - Parameters - __________ - value : str - Value to be checked if null. - - Returns - _______ - bool - True if a stripped/lowercase value in in `nan_col_values`. - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value +from mavecore.validation.utilities import is_null def validate_sra_identifier(identifier): From fe538550690e25c2090ee5dca6112429c7dbe7de Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Mar 2022 16:04:42 -0700 Subject: [PATCH 255/877] make utilities file and add is_null to file --- mavecore/validation/utilities.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mavecore/validation/utilities.py diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py new file mode 100644 index 0000000..4ea0480 --- /dev/null +++ b/mavecore/validation/utilities.py @@ -0,0 +1,18 @@ +from mavecore.validation.constants import null_values_re + +def is_null(value): + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. + + Parameters + __________ + value : str + The value to be checked as null or not. + + Returns + _______ + bool + True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. 
+ """ + value = str(value).strip().lower() + return null_values_re.fullmatch(value) or not value \ No newline at end of file From cc50adfaa18166dcecede01cdef0931e902c484b Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 29 Mar 2022 10:24:24 +1100 Subject: [PATCH 256/877] Update --- .pypirc | 3 + mavecore/validation/dataset_validators.py | 182 +++++++++++++++++++--- setup.py | 2 +- 3 files changed, 167 insertions(+), 20 deletions(-) create mode 100644 .pypirc diff --git a/.pypirc b/.pypirc new file mode 100644 index 0000000..1e8a23e --- /dev/null +++ b/.pypirc @@ -0,0 +1,3 @@ +[pypi] +username = __token__ +password = pypi-AgEIcHlwaS5vcmcCJDBlOGU0OTVhLWFmNzctNDM4Zi1hOTc1LWMyNDM2ZDU5YWU1OQACJXsicGVybWlzc2lvbnMiOiAidXNlciIsICJ2ZXJzaW9uIjogMX0AAAYgJ1KJf63SpxtDo0U_108z8AO02KzJsyaYFgnJTL15jWk \ No newline at end of file diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7a72e0e..5a13192 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -5,20 +5,45 @@ from numpy.testing import assert_array_equal from mavecore.validation import constants +from mavecore.validation.exceptions import ValidationError - -def is_null(value): - """Returns True if a stripped/lowercase value in in `nan_col_values`.""" - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value +from mavecore.validation.utilities import is_null class WordLimitValidator: + """ + This class + + Attributes + __________ + message : str + Message template to describe how many words a field is limited to. + code : str + + counter : str + + """ + message = "This field is limited to {} words." code = "invalid" counter = re.compile(r"\w+\b", flags=re.IGNORECASE) def __init__(self, word_limit, message=None, code=None): + # TODO + # check the code parameter type + """ + This constructor sets the values of the WordLimitValidator class attributes + message, code, and counter. 
+ + Parameters + __________ + word_limit : int + The word limit assigned to the word limit attribute. + message : str + (default = None) The message assigned to the message attribute. + code : + (default = None) The code assigned to the code attribute. + """ if message is not None: self.message = message if code is not None: @@ -26,13 +51,50 @@ def __init__(self, word_limit, message=None, code=None): self.word_limit = int(word_limit) def __call__(self, value): + """ + Parameters + __________ + value : + + Returns + _______ + + Raises + ______ + ValidationError + If + """ if not value: return if len(self.counter.findall(value)) > self.word_limit: - raise ValueError(self.message.format(self.word_limit)) + raise ValidationError(self.message.format(self.word_limit)) def read_header_from_io(file, label=None, msg=None): + # TODO + # confirm types for parameters + """ + This takes a file and reads the header from that file. + + Parameters + __________ + file : + label : str + (default = None) + msg : str + (default = None) The message that is printed in the event of an error is raised. + + Returns + _______ + str + The header that was read from io. + + Raises + ______ + ValidationError + If a header could not be parsed from file. Columns must be coma delimited. Column names + with commas must be escaped by enclosing them in double quotes. + """ if label is None: label = "uploaded" @@ -50,10 +112,24 @@ def read_header_from_io(file, label=None, msg=None): "Columns are comma delimited. 
Column names with commas must be" "escaped by enclosing them in double quotes.".format(label) ) - raise ValueError(msg) + raise ValidationError(msg) def validate_has_hgvs_in_header(header, label=None, msg=None): + """ + Parameters + __________ + header : + label : + default = None + msg : + default = None + + Raises + ______ + ValidationError + If + """ if label is None: label = "Uploaded" params = {} @@ -69,10 +145,29 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): "col_p": constants.hgvs_pro_column, } if not set(header) & set(constants.hgvs_columns): - raise ValueError(msg) + raise ValidationError(msg) def validate_at_least_one_additional_column(header, label=None, msg=None): + # TODO + # verify parameter types + """ + This function checks the passed header to see if there exists additional columns besides the three + specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. + + Parameters + __________ + header : + label : + default = None + msg : + default = None + + Raises + ______ + ValidationError + If there are not additional columns in the header argument. + """ if label is None: label = "Uploaded" params = {} @@ -88,10 +183,29 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): ) ) params = {"label": label} - raise ValueError(msg) + raise ValidationError(msg) def validate_header_contains_no_null_columns(header, label=None, msg=None): + """ + This function checks that the header parameter does not contain any null columns that + are not in the case-insensitive null values listed in constants.readable_null_values. + + Parameters + __________ + header : + label : + (default = None) + msg : + (default = None) + + Raises + ______ + ValidationError + If the file header contains blank/empty/whitespace. Only columns or the + case-insensitive null values listed in constants.readable_null_values + are permitted. 
+ """ if label is None: label = "File" any_null = any([is_null(v) for v in header]) @@ -100,9 +214,11 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): msg = ( "%(label)s file header cannot contain blank/empty/whitespace " "only columns or the following case-insensitive null " - "values: {}.".format(label, ", ".join(constants.readable_null_values)) + "values: {}.".format( + label, ", ".join(constants.readable_null_values) + ) ) - raise ValueError(msg) + raise ValidationError(msg) def validate_datasets_define_same_variants(scores, counts): @@ -116,6 +232,11 @@ def validate_datasets_define_same_variants(scores, counts): Scores dataframe parsed from an uploaded scores file. counts : `pd.DataFrame` Scores dataframe parsed from an uploaded counts file. + + Raises + ______ + ValidationError + If score and counts files do not define the same variants. """ try: assert_array_equal( @@ -131,7 +252,7 @@ def validate_datasets_define_same_variants(scores, counts): counts[constants.hgvs_pro_column].sort_values().values, ) except AssertionError: - raise ValueError( + raise ValidationError( "Your score and counts files do not define the same variants. " "Check that the hgvs columns in both files match." ) @@ -147,6 +268,11 @@ def validate_scoreset_score_data_input(file): ---------- file : :class:`io.FileIO` An open file handle in read mode. + + Raises + ______ + ValidationError + If score data file is missing the required column constants.required_score_column """ file.seek(0) header = read_header_from_io(file, label="Score") @@ -155,7 +281,7 @@ def validate_scoreset_score_data_input(file): validate_at_least_one_additional_column(header, label="Score") if constants.required_score_column not in header: - raise ValueError( + raise ValidationError( "Score data file is missing the required column " + constants.required_score_column + "." @@ -190,27 +316,45 @@ def validate_scoreset_json(dict_): ---------- dict_ : dict Dictionary of keys mapping to a list. 
+ + Raises + ______ + ValidationError + If scoreset data is missing the required key. + ValidationError + If header values are not strings. + ValidationError + If + ValidationError + If missing required column constants.required_score_column for score dataset. + ValidationError + If encountered unexpected keys extras. """ required_columns = [constants.score_columns, constants.count_columns] for key in required_columns: if key not in dict_.keys(): - raise ValueError("Scoreset data is missing the required key " + key) + raise ValidationError( + "Scoreset data is missing the required key " + key + ) columns = dict_[key] if not all([isinstance(c, str) for c in columns]): - raise ValueError("Header values must be strings.") + raise ValidationError("Header values must be strings.") if not isinstance(columns, list): type_ = type(columns).__name__ - raise ValueError( - "Value for " + key.replace("_", " ") + " must be a list not " + type_ + raise ValidationError( + "Value for " + + key.replace("_", " ") + + " must be a list not " + + type_ ) # Check score columns is not-empty and at least contains hgvs and score if key == constants.score_columns: if constants.required_score_column not in columns: - raise ValueError( + raise ValidationError( "Missing required column constants.required_score_column " "for score dataset." 
) @@ -220,4 +364,4 @@ def validate_scoreset_json(dict_): extras = [k for k in dict_.keys() if k not in set(required_columns)] if len(extras) > 0: extras = [k for k in dict_.keys() if k not in required_columns] - raise ValueError("Encountered unexpected keys extras") + raise ValidationError("Encountered unexpected keys extras") diff --git a/setup.py b/setup.py index d4aeb26..11669b1 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setuptools.setup( name="mavecore", - version="0.1.0", + version="0.1.1", author="Daniel Esposito and Alan F Rubin", author_email="alan.rubin@wehi.edu.au", description=( From 66f52cf99d24802cf5fa7ba702027f38b43d660f Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Mar 2022 12:03:42 +1100 Subject: [PATCH 257/877] remove unused constants --- mavecore/validation/constants.py | 45 ------------------- .../validation/variant_validators/hgvs.py | 3 +- 2 files changed, 1 insertion(+), 47 deletions(-) diff --git a/mavecore/validation/constants.py b/mavecore/validation/constants.py index 6630323..611d392 100644 --- a/mavecore/validation/constants.py +++ b/mavecore/validation/constants.py @@ -24,26 +24,6 @@ "'{}'".format(v) for v in set([v.lower() for v in null_values_list]) if v.strip() ] + ["whitespace"] -""" -Sequence constants -""" -AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" -DNA_LETTERS = "ATCG" - -DNA_SEQ_PATTERN = rf"[{DNA_LETTERS}]+" -AA_SEQ_PATTERN = rf"[{AA_LETTERS}]+" - - -""" -Constant definitions for application `experiment`. 
-""" -from mavecore.validation.urn_validators import ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, -) - hgvs_nt_column = "hgvs_nt" hgvs_splice_column = "hgvs_splice" hgvs_pro_column = "hgvs_pro" @@ -55,21 +35,6 @@ variant_count_data = "count_data" required_score_column = "score" -experimentset_url_pattern = "|".join( - [MAVEDB_EXPERIMENTSET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -experiment_url_pattern = "|".join( - [MAVEDB_EXPERIMENT_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -scoreset_url_pattern = "|".join( - [MAVEDB_SCORESET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) - -any_url_pattern = "|".join( - [experimentset_url_pattern, experiment_url_pattern, scoreset_url_pattern] -) - - valid_dataset_columns = [score_columns, count_columns] valid_variant_columns = [variant_score_data, variant_count_data] @@ -78,13 +43,3 @@ variant_count_data: count_columns, } scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} - -# Celery dataset status -processing = "processing" -failed = "failed" -success = "success" - -# User roles -administrator = "administrator" -editor = "editor" -viewer = "viewer" diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 5f19735..b160e0d 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -1,11 +1,10 @@ from functools import partial from typing import Optional, Union -import re from mavehgvs import Variant, MaveHgvsParseError from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import NA_value, null_values_re +from mavecore.validation.constants import null_values_re from mavecore.validation.constants import ( hgvs_nt_column, From 1c0a81a06ba4ecd5ac85d3bdb89ca6640d3993b3 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Mar 2022 12:03:56 
+1100 Subject: [PATCH 258/877] rename NA constant --- mavecore/original_validation/variant_validators/hgvs.py | 2 +- mavecore/validation/constants.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/original_validation/variant_validators/hgvs.py b/mavecore/original_validation/variant_validators/hgvs.py index 2a57a39..3f0c043 100644 --- a/mavecore/original_validation/variant_validators/hgvs.py +++ b/mavecore/original_validation/variant_validators/hgvs.py @@ -4,7 +4,7 @@ from mavehgvs import Variant, MaveHgvsParseError from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import NA_value, null_values_re +from mavecore.validation.constants import NA_STRING, null_values_re from mavecore.validation.constants import ( hgvs_nt_column, diff --git a/mavecore/validation/constants.py b/mavecore/validation/constants.py index 611d392..38cc8d4 100644 --- a/mavecore/validation/constants.py +++ b/mavecore/validation/constants.py @@ -3,7 +3,7 @@ """ Null Constant definitions """ -NA_value = "NA" +NA_STRING = "NA" null_values_list = ( "nan", "na", @@ -13,11 +13,11 @@ "n/a", "null", "nil", - NA_value, + NA_STRING, ) null_values_re = re.compile( - r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_value), flags=re.IGNORECASE + r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_STRING), flags=re.IGNORECASE ) readable_null_values = [ From fa3fafa5caa80c2d77c13ae994d66f10025fd01d Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Mar 2022 12:23:05 +1100 Subject: [PATCH 259/877] consistent naming for readable_null_values_list --- mavecore/original_validation/dataset_validators.py | 4 +++- .../original_validation/variant_validators/dataset.py | 6 +++--- mavecore/validation/constants.py | 4 +--- mavecore/validation/dataset_validators.py | 9 ++++++--- mavecore/validation/variant_validators/dataset.py | 6 +++--- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git 
a/mavecore/original_validation/dataset_validators.py b/mavecore/original_validation/dataset_validators.py index 3e5c464..fd7a20e 100644 --- a/mavecore/original_validation/dataset_validators.py +++ b/mavecore/original_validation/dataset_validators.py @@ -229,7 +229,9 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): msg = ( "%(label)s file header cannot contain blank/empty/whitespace " "only columns or the following case-insensitive null " - "values: {}.".format(label, ", ".join(constants.readable_null_values)) + "values: {}.".format( + label, ", ".join(constants.readable_null_values_list) + ) ) raise ValueError(msg) diff --git a/mavecore/original_validation/variant_validators/dataset.py b/mavecore/original_validation/variant_validators/dataset.py index 3b67cd5..9461a0e 100644 --- a/mavecore/original_validation/variant_validators/dataset.py +++ b/mavecore/original_validation/variant_validators/dataset.py @@ -18,7 +18,7 @@ required_score_column, null_values_list, null_values_re, - readable_null_values, + readable_null_values_list, ) @@ -496,7 +496,7 @@ def _validate_columns(self) -> "MaveDataset": self._errors.append( f"Column names in your {self.label} file cannot values " f"considered null such as the following: " - f"{', '.join(readable_null_values)}" + f"{', '.join(readable_null_values_list)}" ) columns = [c for c in columns if not is_null(c)] @@ -724,7 +724,7 @@ def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset self._errors.append( f"Primary column (inferred as '{self._index_column}') " f"cannot contain any null values from " - f"{', '.join(readable_null_values)} (case-insensitive)" + f"{', '.join(readable_null_values_list)} (case-insensitive)" ) if not allow_duplicates: diff --git a/mavecore/validation/constants.py b/mavecore/validation/constants.py index 38cc8d4..292d296 100644 --- a/mavecore/validation/constants.py +++ b/mavecore/validation/constants.py @@ -20,9 +20,7 @@ 
r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_STRING), flags=re.IGNORECASE ) -readable_null_values = [ - "'{}'".format(v) for v in set([v.lower() for v in null_values_list]) if v.strip() -] + ["whitespace"] +readable_null_values_list = [f"'{s}'" for s in null_values_list] + ["whitespace"] hgvs_nt_column = "hgvs_nt" hgvs_splice_column = "hgvs_splice" diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index b8585f4..ca23c29 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -9,7 +9,7 @@ def is_null(value): """ - Returns True if a stripped/lowercase value in in `nan_col_values`. + Checks if a stripped/lowercase value is one of the recognized NA or NULL string values. Parameters __________ @@ -19,7 +19,8 @@ def is_null(value): Returns _______ bool - True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. + True value is NoneType, is an empty string, or if value matches the stated regex patterns in + constants.null_values_re. 
""" value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value @@ -221,7 +222,9 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): msg = ( "%(label)s file header cannot contain blank/empty/whitespace " "only columns or the following case-insensitive null " - "values: {}.".format(label, ", ".join(constants.readable_null_values)) + "values: {}.".format( + label, ", ".join(constants.readable_null_values_list) + ) ) raise ValueError(msg) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 55b4027..6c5dfe7 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -18,7 +18,7 @@ required_score_column, null_values_list, null_values_re, - readable_null_values, + readable_null_values_list, ) @@ -523,7 +523,7 @@ def _validate_columns(self) -> "MaveDataset": self._errors.append( f"Column names in your {self.label} file cannot values " f"considered null such as the following: " - f"{', '.join(readable_null_values)}" + f"{', '.join(readable_null_values_list)}" ) columns = [c for c in columns if not is_null(c)] @@ -756,7 +756,7 @@ def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset self._errors.append( f"Primary column (inferred as '{self._index_column}') " f"cannot contain any null values from " - f"{', '.join(readable_null_values)} (case-insensitive)" + f"{', '.join(readable_null_values_list)} (case-insensitive)" ) if not allow_duplicates: From c9de13acb273c11623ac703c669f05defdb679c5 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Tue, 29 Mar 2022 12:23:24 +1100 Subject: [PATCH 260/877] make null value definition less redundant --- mavecore/validation/constants.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/constants.py b/mavecore/validation/constants.py index 292d296..f7bfe47 100644 --- 
a/mavecore/validation/constants.py +++ b/mavecore/validation/constants.py @@ -13,11 +13,17 @@ "n/a", "null", "nil", - NA_STRING, ) +# enforce the assumption that these are all lowercase values +null_values_list = [s.lower() for s in null_values_list] +# add the NA_STRING only if it's not already in the list +if NA_STRING.lower() not in null_values_list: + null_values_list.append(NA_STRING.lower()) +null_values_list.sort() null_values_re = re.compile( - r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_STRING), flags=re.IGNORECASE + r"^\s+$|" + "|".join(f"^{s}$" for s in null_values_list if len(s)), + flags=re.IGNORECASE, ) readable_null_values_list = [f"'{s}'" for s in null_values_list] + ["whitespace"] From 943fd876d514552d5b2548cdeede139d47c281a1 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 29 Mar 2022 12:36:44 +1100 Subject: [PATCH 261/877] Delete .pypirc --- .pypirc | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .pypirc diff --git a/.pypirc b/.pypirc deleted file mode 100644 index 1e8a23e..0000000 --- a/.pypirc +++ /dev/null @@ -1,3 +0,0 @@ -[pypi] -username = __token__ -password = pypi-AgEIcHlwaS5vcmcCJDBlOGU0OTVhLWFmNzctNDM4Zi1hOTc1LWMyNDM2ZDU5YWU1OQACJXsicGVybWlzc2lvbnMiOiAidXNlciIsICJ2ZXJzaW9uIjogMX0AAAYgJ1KJf63SpxtDo0U_108z8AO02KzJsyaYFgnJTL15jWk \ No newline at end of file From 46cf82d1769006cd8bb91f50fd0505950a728120 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 29 Mar 2022 12:37:02 +1100 Subject: [PATCH 262/877] Update --- mavecore/validation/utilities.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 mavecore/validation/utilities.py diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py new file mode 100644 index 0000000..d9bd645 --- /dev/null +++ b/mavecore/validation/utilities.py @@ -0,0 +1,19 @@ +from mavecore.validation.constants import null_values_re + + +def is_null(value): + """ + Returns True if a stripped/lowercase value in in `nan_col_values`. 
+ + Parameters + __________ + value : str + The value to be checked as null or not. + + Returns + _______ + bool + True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. + """ + value = str(value).strip().lower() + return null_values_re.fullmatch(value) or not value From f07d77d35faf39a7aade17b3f6adab32bc210619 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 29 Mar 2022 12:41:39 +1100 Subject: [PATCH 263/877] Update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 11669b1..0defd7b 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setuptools.setup( name="mavecore", - version="0.1.1", + version="0.1.2", author="Daniel Esposito and Alan F Rubin", author_email="alan.rubin@wehi.edu.au", description=( From 8b0dea53614a22f9ae742e388702f0a1d8d979b9 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Thu, 31 Mar 2022 16:09:25 +1100 Subject: [PATCH 264/877] Change urn_validators.py error information. 
--- mavecore/validation/urn_validators.py | 30 +++++++++++---------------- setup.py | 2 +- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index f44f760..733f724 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -57,39 +57,33 @@ def validate_mavedb_urn(urn): if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid urn.", params={"urn": urn} - ) + raise ValidationError("{} is not a valid urn.".format(urn)) def validate_mavedb_urn_experimentset(urn): - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + if not ( + MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn) + ): raise ValidationError( - "Error test" - # "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} + # "Error test" + "{}'s is not a valid Experiment Set urn.".format(urn) ) def validate_mavedb_urn_experiment(urn): - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + if not ( + MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn) + ): raise ValidationError( - "Error test" - # "%(urn)s is not a valid Experiment urn.", params={"urn": urn} + "{}'s is not a valid Experiment urn.".format(urn) ) def validate_mavedb_urn_scoreset(urn): if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid score set urn.", params={"urn": urn} - ) + raise ValidationError("{}'s is not a valid score set urn.".format(urn)) def validate_mavedb_urn_variant(urn): if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "Error test" - # "%(urn)s is not a valid Variant urn.", params={"urn": urn} - ) + raise ValidationError("{}'s is not a valid Variant urn.".format(urn)) diff --git a/setup.py b/setup.py index 
0defd7b..2ad6391 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setuptools.setup( name="mavecore", - version="0.1.2", + version="0.1.3", author="Daniel Esposito and Alan F Rubin", author_email="alan.rubin@wehi.edu.au", description=( From a4d623963560f12a287c8363a45699fe8db65ca7 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Fri, 1 Apr 2022 11:29:27 +1100 Subject: [PATCH 265/877] Change a typo. --- mavecore/validation/urn_validators.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 733f724..68b14bb 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -57,7 +57,7 @@ def validate_mavedb_urn(urn): if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError("{} is not a valid urn.".format(urn)) + raise ValidationError("{}'s is not a valid urn.".format(urn)) def validate_mavedb_urn_experimentset(urn): diff --git a/setup.py b/setup.py index 2ad6391..4deb12d 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setuptools.setup( name="mavecore", version="0.1.3", - author="Daniel Esposito and Alan F Rubin", + author="Daniel Esposito, Alan F Rubin", author_email="alan.rubin@wehi.edu.au", description=( "MaveCore is to create a new dependency that contains all the shared functionality for MaveTools and MaveDB." 
From 8b5ec1b0fc1abec866b9fb7634e6e0ccdf6f87d1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 17:31:48 -0700 Subject: [PATCH 266/877] mark function as not Django dependent --- mavecore/validation/variant_validators/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 4880f52..635106c 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -23,7 +23,7 @@ def is_null(value): - # TODO + # TODO this is not Django dependent """ Returns True if a stripped/lowercase value in in `nan_col_values`. @@ -34,6 +34,7 @@ def is_null(value): Returns _______ bool + """ value = str(value).strip().lower() return null_values_re.fullmatch(value) or not value From f3139bc6164eab074a1bbec62c9b44602f0a6fd8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 17:32:02 -0700 Subject: [PATCH 267/877] change error type --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7a38d26..854ac8b 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -268,7 +268,7 @@ def validate_scoreset_score_data_input(file): Raises ______ - ValueError + ValidationError If score data file is missing the required column constants.required_score_column """ file.seek(0) From 811257b37c0e1f63f6c54f9169d5666f8efd8646 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 17:35:25 -0700 Subject: [PATCH 268/877] edit setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ac68b6b..45d528e 100644 --- 
a/setup.py +++ b/setup.py @@ -12,14 +12,14 @@ setuptools.setup( name="mavecore", version="0.1.0", - author="Daniel Esposito and Alan F Rubin", + author="MaveDB developers", author_email="alan.rubin@wehi.edu.au", description=( "MaveCore is to create a new dependency that contains all the shared functionality for MaveTools and MaveDB." ), long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/VariantEffect/MaveCore/tree/add_validation", + url="https://github.com/VariantEffect/MaveCore/tree/testMaveCore", packages=setuptools.find_packages(), classifiers=[ "Development Status :: 3 - Alpha", From 265eb6e237c33f19e23b473d9303db4d186fb6a1 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:26:40 +1100 Subject: [PATCH 269/877] update requirements --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 4deb12d..455bb09 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,13 @@ import setuptools -import sys with open("README.md", "r") as fh: long_description = fh.read() -requirements = ["fqfa>=1.2.1"] -# fqfa requires backported dataclasses in Python 3.6 -if sys.version_info.major == 3 and sys.version_info.minor == 6: - requirements.append("dataclasses") +requirements = ["fqfa>=1.2.1", + "mavehgvs>=0.4.0", + "idutils>=1.1.0", + "pandas>=1.1.0", + ] setuptools.setup( name="mavecore", From 6a2550a81a1489a0cf3927847d70d818cb4bbcec Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:26:53 +1100 Subject: [PATCH 270/877] update setup metadata --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 455bb09..335703f 100644 --- a/setup.py +++ b/setup.py @@ -11,18 +11,18 @@ setuptools.setup( name="mavecore", - version="0.1.3", - author="Daniel Esposito, Alan F Rubin", + version="0.1.0", + author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", description=( - "MaveCore is to 
create a new dependency that contains all the shared functionality for MaveTools and MaveDB." + "MaveCore implements shared functionality for MaveTools and MaveDB." ), long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/VariantEffect/MaveCore/tree/testMaveCore", + url="https://github.com/VariantEffect/MaveCore/", packages=setuptools.find_packages(), classifiers=[ - "Development Status :: 3 - Alpha", + "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Bio-Informatics", "License :: OSI Approved :: BSD License", @@ -32,4 +32,4 @@ python_requires=">=3.6", install_requires=requirements, test_suite="tests", -) +) \ No newline at end of file From 155c232d46a96c661a26729fc6f7ae54fba9e17c Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 28 Mar 2022 15:28:14 +1100 Subject: [PATCH 271/877] add pycharm files to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b6e4761..bd6ad26 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm +.idea/ From 5f52fbc894b9cb01aa67e57ca361b5fefd650133 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 18:01:04 -0700 Subject: [PATCH 272/877] remove imports from requirements.txt --- requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1424f39..5efa4f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,2 @@ -pandas~=1.4.1 -mavehgvs~=0.4.0 numpy~=1.22.2 -fqfa~=1.2.1 -IDUtils~=1.1.12 setuptools~=60.9.3 \ No newline at end of file From a6be452eedbd8dba1d39e7d5761a021cea937193 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:18:41 -0700 Subject: [PATCH 273/877] remove imports from requirements.txt --- 
.../validation/variant_validators/dataset.py | 97 +++++++++---------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 635106c..93b413f 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -23,7 +23,6 @@ def is_null(value): - # TODO this is not Django dependent """ Returns True if a stripped/lowercase value in in `nan_col_values`. @@ -41,12 +40,12 @@ def is_null(value): class MaveDataset: - # TODO + # TODO Django dependent """ """ class DatasetType: - # TODO + # TODO Django dependent """ """ @@ -54,7 +53,7 @@ class DatasetType: COUNTS = "counts" class HGVSColumns: - # TODO + # TODO Django dependent """ """ @@ -64,7 +63,7 @@ class HGVSColumns: @classmethod def options(cls) -> List[str]: - # TODO + # TODO Django dependent """ Returns @@ -74,13 +73,13 @@ def options(cls) -> List[str]: return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] class AdditionalColumns: - # TODO + # TODO Django dependent """ """ @classmethod def options(cls) -> List[str]: - # TODO + # TODO Django dependent """ Returns @@ -92,7 +91,7 @@ def options(cls) -> List[str]: # ---------------------- Construction------------------------------------ # @classmethod def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": - # TODO + # TODO Django dependent """ Parameters @@ -108,7 +107,7 @@ def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": @classmethod def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": - # TODO + # TODO Django dependent """ Parameters @@ -125,7 +124,7 @@ def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": def _for_type( cls, file: Union[str, TextIO, BinaryIO], dataset_type: str ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: - # TODO + # TODO Django dependent """ Parameters @@ -188,7 
+187,7 @@ def _for_type( # ---------------------- Public ----------------------------------------- # @property def label(self) -> str: - # TODO + # TODO Django dependent """ Returns @@ -199,7 +198,7 @@ def label(self) -> str: @property def is_valid(self) -> Optional[bool]: - # TODO + # TODO Django dependent """ Returns @@ -212,7 +211,7 @@ def is_valid(self) -> Optional[bool]: @property def n_errors(self) -> Optional[int]: - # TODO + # TODO Django dependent """ Returns @@ -225,7 +224,7 @@ def n_errors(self) -> Optional[int]: @property def errors(self) -> Optional[List[str]]: - # TODO + # TODO Django dependent """ Returns @@ -236,7 +235,7 @@ def errors(self) -> Optional[List[str]]: @property def is_empty(self) -> bool: - # TODO + # TODO Django dependent """ Returns @@ -247,7 +246,7 @@ def is_empty(self) -> bool: @property def columns(self) -> List[str]: - # TODO + # TODO Django dependent """ Returns @@ -258,7 +257,7 @@ def columns(self) -> List[str]: @property def hgvs_columns(self) -> List[str]: - # TODO + # TODO Django dependent """ Returns @@ -269,7 +268,7 @@ def hgvs_columns(self) -> List[str]: @property def non_hgvs_columns(self) -> List[str]: - # TODO + # TODO Django dependent """ Returns @@ -280,7 +279,7 @@ def non_hgvs_columns(self) -> List[str]: @property def n_rows(self) -> int: - # TODO + # TODO Django dependent """ Returns @@ -291,7 +290,7 @@ def n_rows(self) -> int: @property def n_columns(self) -> int: - # TODO + # TODO Django dependent """ Returns @@ -302,7 +301,7 @@ def n_columns(self) -> int: @property def index_column(self) -> Optional[str]: - # TODO + # TODO Django dependent """ Returns @@ -315,7 +314,7 @@ def index_column(self) -> Optional[str]: @property def index(self) -> Optional[pd.Index]: - # TODO + # TODO Django dependent """ Returns @@ -327,7 +326,7 @@ def index(self) -> Optional[pd.Index]: return self._df.index.copy(deep=True) def data(self, serializable=False) -> pd.DataFrame: - # TODO + # TODO Django dependent """ Return underlying 
dataframe object. @@ -349,7 +348,7 @@ def data(self, serializable=False) -> pd.DataFrame: return self._df.copy(deep=True) def match_other(self, other: "MaveDataset") -> Optional[bool]: - # TODO + # TODO Django dependent """ Check that each dataset defined the same variants in each column. @@ -376,7 +375,7 @@ def match_other(self, other: "MaveDataset") -> Optional[bool]: ) def to_dict(self) -> Dict[str, Dict]: - # TODO + # TODO Django dependent """ Returns underlying dataframe as dictionary in 'records' orientation. Keys will be index values and values will be an inner dictionary mapping @@ -398,7 +397,7 @@ def validate( relaxed_ordering: bool = False, allow_index_duplicates: bool = False, ) -> "MaveDataset": - # TODO + # TODO Django dependent """ Parameters @@ -453,7 +452,7 @@ def __init__( index_column: Optional[str] = None, errors: Optional[List[str]] = None, ): - # TODO + # TODO Django dependent """ Parameters @@ -470,7 +469,7 @@ def __init__( self._errors = None if errors is None else list(errors) def __repr__(self): - # TODO + # TODO Django dependent """ Returns @@ -488,7 +487,7 @@ def __repr__(self): @property def _column_order(self) -> Dict[str, int]: - # TODO + # TODO Django dependent """ Returns @@ -509,7 +508,7 @@ def _column_order(self) -> Dict[str, int]: ) def _validate_columns(self) -> "MaveDataset": - # TODO + # TODO Django dependent """ Returns @@ -561,7 +560,7 @@ def _validate_columns(self) -> "MaveDataset": return self def _normalize_data(self) -> "MaveDataset": - # TODO + # TODO Django dependent """ Returns @@ -585,7 +584,7 @@ def _normalize_data(self) -> "MaveDataset": def _validate_genomic_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": - # TODO + # TODO Django dependent """ Parameters @@ -639,7 +638,7 @@ def _validate_genomic_variants( def _validate_transcript_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": - # TODO + # TODO Django dependent 
""" Parameters @@ -686,7 +685,7 @@ def _validate_transcript_variants( def _validate_protein_variants( self, targetseq: Optional[str] = None, relaxed_ordering: bool = False ) -> "MaveDataset": - # TODO + # TODO Django dependent """ Parameters @@ -738,7 +737,7 @@ def _validate_protein_variants( return self def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": - # TODO + # TODO Django dependent """ Parameters @@ -790,7 +789,7 @@ def _validate_variants( targetseq: Optional[str] = None, relaxed_ordering: bool = False, ) -> Tuple[pd.Series, Set[str], List[str]]: - # TODO + # TODO Django dependent """ Parameters @@ -813,7 +812,7 @@ def _validate_variants( errors = [] def validate_variant(variant: str): - # TODO + # TODO Django dependent # TODO: logic mirrors that in validate_hgvs_string, which is kept # as a standalone function for backwards compatibility with # django's model validator field. Merge at some point. @@ -861,7 +860,7 @@ def validate_variant(variant: str): return validated_variants, prefixes, errors def _column_is_null(self, column) -> bool: - # TODO + # TODO Django dependent """ Parameters @@ -875,7 +874,7 @@ def _column_is_null(self, column) -> bool: return len(self._df[self._df[column].isna()]) == len(self._df) def _column_is_partially_null(self, column) -> bool: - # TODO + # TODO Django dependent """ Parameters @@ -889,7 +888,7 @@ def _column_is_partially_null(self, column) -> bool: return 0 < len(self._df[self._df[column].isna()]) < len(self._df) def _column_is_fully_specified(self, column) -> bool: - # TODO + # TODO Django dependent """ Parameters @@ -905,7 +904,7 @@ def _column_is_fully_specified(self, column) -> bool: def _validate_variant_prefix_for_column( self, variant: Variant, prefix: str, column: str, splice_defined: bool ) -> Optional[str]: - # TODO + # TODO Django dependent """ Parameters @@ -969,12 +968,12 @@ def _validate_variant_prefix_for_column( class MaveScoresDataset(MaveDataset): - # TODO + # TODO Django 
dependent """ """ class AdditionalColumns: - # TODO + # TODO Django dependent """ """ @@ -982,7 +981,7 @@ class AdditionalColumns: @classmethod def options(cls) -> List[str]: - # TODO + # TODO Django dependent """ Returns @@ -993,7 +992,7 @@ def options(cls) -> List[str]: @property def label(self) -> str: - # TODO + # TODO Django dependent """ Returns @@ -1003,7 +1002,7 @@ def label(self) -> str: return "scores" def _validate_columns(self) -> "MaveDataset": - # TODO + # TODO Django dependent """ Returns @@ -1026,7 +1025,7 @@ def _validate_columns(self) -> "MaveDataset": return self def _normalize_data(self) -> "MaveDataset": - # TODO + # TODO Django dependent """ Returns @@ -1052,13 +1051,13 @@ def _normalize_data(self) -> "MaveDataset": class MaveCountsDataset(MaveDataset): - # TODO + # TODO Django dependent """ """ @property def label(self) -> str: - # TODO + # TODO Django dependent """ Returns From 30392c17a0b5f54814794ac6aff93190e5e3189b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:28:52 -0700 Subject: [PATCH 274/877] mark file as needing to be refactored --- mavecore/validation/variant_validators/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 93b413f..385a9c2 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -1,3 +1,4 @@ +# TODO Django dependent, whole file will need to be refactored import re from collections import defaultdict from io import StringIO From 589fc32c0a86f04bb51b9d399aa25976390a64d3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:29:12 -0700 Subject: [PATCH 275/877] import function from utilities.py --- .../validation/variant_validators/dataset.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git 
a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 385a9c2..36fe627 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -22,22 +22,7 @@ readable_null_values ) - -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. - - Parameters - __________ - value : - - Returns - _______ - bool - - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value +from mavecore.validation.utilities import is_null class MaveDataset: From dfdc795d4c9da18b4d6cb55b3f00dfd2e5e980b7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:29:29 -0700 Subject: [PATCH 276/877] mark as TODO --- mavecore/validation/dataset_validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 5a13192..a06e0a0 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -29,7 +29,6 @@ class WordLimitValidator: counter = re.compile(r"\w+\b", flags=re.IGNORECASE) def __init__(self, word_limit, message=None, code=None): - # TODO # check the code parameter type """ This constructor sets the values of the WordLimitValidator class attributes From 535594ec56b4644eeffa6dd0e2dbaa7e041f7102 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:29:56 -0700 Subject: [PATCH 277/877] mark whole file as needing to be refactored --- mavecore/validation/genome_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index d506a6d..f2234a8 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -1,3 +1,4 @@ +# TODO Django dependent, Django 
forms, whole file needs to be refactored """ Validator functions for the fields of the following classes: WildTypeSequence From c7c78a59c40bc98f0b04b734d2afea88f392cc89 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:30:02 -0700 Subject: [PATCH 278/877] mark whole file as needing to be refactored --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 41e4079..cb0e450 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -1,3 +1,4 @@ +# TODO Django dependent, Django forms, whole file needs to be refactored import idutils from mavecore.validation.exceptions import ValidationError From 4e5a5a80e151828fb6f46d4ed42114bc63ca37c7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:30:35 -0700 Subject: [PATCH 279/877] mark function as needing to be refactored --- mavecore/validation/urn_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index f8f6d3a..341dba6 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -56,6 +56,7 @@ def validate_mavedb_urn(urn): + # TODO, currently not functioning in MaveDB """ This function validates a MaveDB urn and raises an error if it is not valid. 
From a9194d698e71765785dfcec8875a29ae72f68e52 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:37:20 -0700 Subject: [PATCH 280/877] add TODO for file extension validators that need to be reimplemented --- mavecore/validation/dataset_validators.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index a06e0a0..15ced6f 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -364,3 +364,8 @@ def validate_scoreset_json(dict_): if len(extras) > 0: extras = [k for k in dict_.keys() if k not in required_columns] raise ValidationError("Encountered unexpected keys extras") + +# TODO reimplement file extension validators from MaveDB +# validate_csv_extension +# validate_gz_extension +# validate_json_extension \ No newline at end of file From 30438d3a0c870ffe5a40e4c8bda6cf9f81e272b8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 1 Apr 2022 16:43:09 -0700 Subject: [PATCH 281/877] add general validators --- mavecore/validation/general_validators.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mavecore/validation/general_validators.py diff --git a/mavecore/validation/general_validators.py b/mavecore/validation/general_validators.py new file mode 100644 index 0000000..e69de29 From cf951987ca2f154a938ecec0b52669ffaf4121e6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 1 Apr 2022 17:39:04 -0700 Subject: [PATCH 282/877] replicate Django validation error --- mavecore/validation/exceptions.py | 147 +++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 2851fa7..22d239e 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py 
@@ -1,2 +1,145 @@ -class ValidationError(ValueError): - pass +# note: validation error code in this file is from Django +import operator + +NON_FIELD_ERRORS = "__all__" + + +class ValidationError(Exception): + """An error while validating data.""" + + def __init__(self, message, code=None, params=None): + """ + The `message` argument can be a single error, a list of errors, or a + dictionary that maps field names to lists of errors. What we define as + an "error" can be either a simple string or an instance of + ValidationError with its message attribute set, and what we define as + list or dictionary can be an actual `list` or `dict` or an instance + of ValidationError with its `error_list` or `error_dict` attribute set. + """ + super().__init__(message, code, params) + + if isinstance(message, ValidationError): + if hasattr(message, "error_dict"): + message = message.error_dict + elif not hasattr(message, "message"): + message = message.error_list + else: + message, code, params = message.message, message.code, message.params + + if isinstance(message, dict): + self.error_dict = {} + for field, messages in message.items(): + if not isinstance(messages, ValidationError): + messages = ValidationError(messages) + self.error_dict[field] = messages.error_list + + elif isinstance(message, list): + self.error_list = [] + for message in message: + # Normalize plain strings to instances of ValidationError. + if not isinstance(message, ValidationError): + message = ValidationError(message) + if hasattr(message, "error_dict"): + self.error_list.extend(sum(message.error_dict.values(), [])) + else: + self.error_list.extend(message.error_list) + + else: + self.message = message + self.code = code + self.params = params + self.error_list = [self] + + @property + def message_dict(self): + # Trigger an AttributeError if this ValidationError + # doesn't have an error_dict. 
+ getattr(self, "error_dict") + + return dict(self) + + @property + def messages(self): + if hasattr(self, "error_dict"): + return sum(dict(self).values(), []) + return list(self) + + def update_error_dict(self, error_dict): + if hasattr(self, "error_dict"): + for field, error_list in self.error_dict.items(): + error_dict.setdefault(field, []).extend(error_list) + else: + error_dict.setdefault(NON_FIELD_ERRORS, []).extend(self.error_list) + return error_dict + + def __iter__(self): + if hasattr(self, "error_dict"): + for field, errors in self.error_dict.items(): + yield field, list(ValidationError(errors)) + else: + for error in self.error_list: + message = error.message + if error.params: + message %= error.params + yield str(message) + + def __str__(self): + if hasattr(self, "error_dict"): + return repr(dict(self)) + return repr(list(self)) + + def __repr__(self): + return "ValidationError(%s)" % self + + def __eq__(self, other): + if not isinstance(other, ValidationError): + return NotImplemented + return hash(self) == hash(other) + + def __hash__(self): + if hasattr(self, "message"): + return hash( + ( + self.message, + self.code, + make_hashable(self.params), + ) + ) + if hasattr(self, "error_dict"): + return hash(make_hashable(self.error_dict)) + return hash(tuple(sorted(self.error_list, key=operator.attrgetter("message")))) + + +def make_hashable(value): + """ + Attempt to make value hashable or raise a TypeError if it fails. + + The returned value should generate the same hash for equal values. + """ + if isinstance(value, dict): + return tuple([ + (key, make_hashable(nested_value)) + for key, nested_value in sorted(value.items()) + ]) + # Try hash to avoid converting a hashable iterable (e.g. string, frozenset) + # to a tuple. + try: + hash(value) + except TypeError: + if is_iterable(value): + return tuple(map(make_hashable, value)) + # Non-hashable, non-iterable. 
+ raise + return value + + +def is_iterable(x): + """ + An implementation independent way of checking for iterables + """ + try: + iter(x) + except TypeError: + return False + else: + return True \ No newline at end of file From 63df5d74c1b1cd1734e165f6eaa24ce533749fe8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 1 Apr 2022 17:43:26 -0700 Subject: [PATCH 283/877] replicate Django file extension validator --- mavecore/validation/general_validators.py | 73 +++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/mavecore/validation/general_validators.py b/mavecore/validation/general_validators.py index e69de29..e2962cf 100644 --- a/mavecore/validation/general_validators.py +++ b/mavecore/validation/general_validators.py @@ -0,0 +1,73 @@ +# note: FileExtensionValidator is from Django +from pathlib import Path +from mavecore.validation.exceptions import ValidationError +# validate_csv_extension +# validate_gz_extension +# validate_json_extension + +validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) +validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) +validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) + +class FileExtensionValidator: + # TODO, may need to edit validation error, will try to replicate Django error first + """ + This class validates file extensions and will replace the Django validator of + the same name. + + From Django: + Raises a ValidationError with a code of 'invalid_extension' if the extension of + value.name (value is a File) isn’t found in allowed_extensions. The extension is + compared case-insensitively with allowed_extensions. + """ + message = _("File extension “%(extension)s” is not allowed. " + "Allowed extensions are: %(allowed_extensions)s." 
+ ) + code = "invalid_extension" + + def __init__(self, allowed_extensions=None, message=None, code=None): + """ + This constructor sets the values of the FileExtensionValidator. + + Parameters + __________ + allowed_extensions : List[str] + A list of allowed file extensions. + message : str + (default = None) The message assigned to the message attribute. + code : + (default = None) The code assigned to the code attribute. + """ + if allowed_extensions is not None: + allowed_extensions = [ + allowed_extension.lower() for allowed_extension in allowed_extensions + ] + self.allowed_extensions = allowed_extensions + if message is not None: + self.message = message + if code is not None: + self.code = code + + def __call__(self, value): + extension = Path(value.name).suffix[1:].lower() + if ( + self.allowed_extensions is not None + and extension not in self.allowed_extensions + ): + raise ValidationError( + self.message, + code=self.code, + params={ + "extension": extension, + "allowed_extensions": ", ".join(self.allowed_extensions), + "value": value, + }, + ) + + def __eq__(self, other): + return ( + isinstance(other, self.__class__) + and self.allowed_extensions == other.allowed_extensions + and self.message == other.message + and self.code == other.code + ) \ No newline at end of file From c549b7ab0e7f1f40a34e9bf75cccea6ded68199c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 1 Apr 2022 17:43:53 -0700 Subject: [PATCH 284/877] note validators that need to be reimplemented --- mavecore/validation/dataset_validators.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 15ced6f..0a5c506 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -368,4 +368,8 @@ def validate_scoreset_json(dict_): # TODO reimplement file extension validators from MaveDB # 
validate_csv_extension # validate_gz_extension -# validate_json_extension \ No newline at end of file +# validate_json_extension + +#validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) +#validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) +#validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) From 6a21dac15ff901551e64129b6774c4913659821b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Sat, 2 Apr 2022 16:34:04 -0700 Subject: [PATCH 285/877] reformat --- mavecore/validation/dataset_validators.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 0a5c506..61f62e2 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -9,6 +9,16 @@ from mavecore.validation.utilities import is_null +from mavecore.validation.general_validators import FileExtensionValidator + +# TODO reimplement file extension validators from MaveDB +# validate_csv_extension +# validate_gz_extension +# validate_json_extension + +validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) +validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) +validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) class WordLimitValidator: """ From 4b8dc24388105b9091d14b94791424b6dbac072e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Sat, 2 Apr 2022 16:34:11 -0700 Subject: [PATCH 286/877] reformat --- mavecore/validation/dataset_validators.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 61f62e2..dd875c3 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -375,11 +375,4 @@ def validate_scoreset_json(dict_): 
extras = [k for k in dict_.keys() if k not in required_columns] raise ValidationError("Encountered unexpected keys extras") -# TODO reimplement file extension validators from MaveDB -# validate_csv_extension -# validate_gz_extension -# validate_json_extension -#validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -#validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -#validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) From 7678a914c7f183987ca761c2bcde64143cb54373 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Sat, 2 Apr 2022 16:34:32 -0700 Subject: [PATCH 287/877] add general ValidationError --- mavecore/validation/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 22d239e..666463d 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -5,6 +5,10 @@ class ValidationError(Exception): + None + + +class ValidationError2(Exception): """An error while validating data.""" def __init__(self, message, code=None, params=None): From bba2820edbfb63b046a9e20f425f5a479a975bd8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Sun, 3 Apr 2022 21:16:49 -0700 Subject: [PATCH 288/877] clarify comment --- mavecore/validation/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 666463d..466c2fa 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -1,4 +1,4 @@ -# note: validation error code in this file is from Django +# note: ValidationError2 code in this file is from Django import operator NON_FIELD_ERRORS = "__all__" From ae0460c8ff66ebb4a7811ed5cb755d49a605e501 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Mon, 4 Apr 2022 14:53:31 +1000 Subject: [PATCH 289/877] update 
MaveCore version. --- setup.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 335703f..b972138 100644 --- a/setup.py +++ b/setup.py @@ -3,15 +3,16 @@ with open("README.md", "r") as fh: long_description = fh.read() -requirements = ["fqfa>=1.2.1", - "mavehgvs>=0.4.0", - "idutils>=1.1.0", - "pandas>=1.1.0", - ] +requirements = [ + "fqfa>=1.2.1", + "mavehgvs>=0.4.0", + "idutils>=1.1.0", + "pandas>=1.1.0", +] setuptools.setup( name="mavecore", - version="0.1.0", + version="0.1.3", author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", description=( @@ -32,4 +33,4 @@ python_requires=">=3.6", install_requires=requirements, test_suite="tests", -) \ No newline at end of file +) From 9004e3c0f9550d94a32616a4d9d2103f33cd51ae Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Mon, 4 Apr 2022 15:33:36 +1000 Subject: [PATCH 290/877] Upload new MaveCore version to PyPI. --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 29f92c2..286779f 100644 --- a/setup.py +++ b/setup.py @@ -12,13 +12,15 @@ setuptools.setup( name="mavecore", - version="0.1.3", + version="0.1.4", author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", - description=("MaveCore implements shared functionality for MaveTools and MaveDB."), + description=( + "MaveCore implements shared functionality for MaveTools and MaveDB." 
+ ), long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/VariantEffect/MaveCore/", + url="https://github.com/VariantEffect/MaveCore/tree/testMaveCore", packages=setuptools.find_packages(), classifiers=[ "Development Status :: 2 - Pre-Alpha", From cc38224c40285c873f2eb9e32cc8183fd0449e9a Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 15:44:33 +1000 Subject: [PATCH 291/877] delete old requirements --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 5efa4f9..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -numpy~=1.22.2 -setuptools~=60.9.3 \ No newline at end of file From ea9050f373b5cdad5db2ee7be4738c273c1b0d7a Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 15:46:29 +1000 Subject: [PATCH 292/877] update to latest black version --- .pre-commit-config.yaml | 2 +- requirements-dev.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 requirements-dev.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aa9503f..8f4bf35 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ repos: - id: trailing-whitespace files: \.py$ - repo: https://github.com/psf/black - rev: 21.5b1 + rev: 22.3.0 hooks: - id: black language_version: python3 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..416634f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pre-commit From a596d9cdf4360b6d70f7208632651260a85d43fb Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 15:49:45 +1000 Subject: [PATCH 293/877] fix broken docstrings --- .../validation/variant_validators/dataset.py | 50 +++++++++++++------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/mavecore/validation/variant_validators/dataset.py 
b/mavecore/validation/variant_validators/dataset.py index d6c7288..088e5c8 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -26,9 +26,6 @@ class MaveDataset: - # TODO Django dependent - """ - class DatasetType: # TODO """ """ @@ -54,8 +51,6 @@ def options(cls) -> List[str]: return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] class AdditionalColumns: - # TODO Django dependent - """ @classmethod def options(cls) -> List[str]: # TODO Django dependent @@ -69,7 +64,9 @@ def options(cls) -> List[str]: # ---------------------- Construction------------------------------------ # @classmethod - def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": + def for_scores( + cls, file: Union[str, TextIO, BinaryIO] + ) -> "MaveScoresDataset": # TODO Django dependent """ @@ -85,7 +82,9 @@ def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) @classmethod - def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": + def for_counts( + cls, file: Union[str, TextIO, BinaryIO] + ) -> "MaveCountsDataset": # TODO Django dependent """ @@ -132,7 +131,8 @@ def _for_type( handle = StringIO(file_contents) else: raise TypeError( - f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" + f"Expected file path or buffer object. " + f"Got '{type(file).__name__}'" ) extra_na_values = set( @@ -161,7 +161,9 @@ def _for_type( elif dataset_type == cls.DatasetType.COUNTS: return MaveCountsDataset(df) else: - raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") + raise ValueError( + f"'{dataset_type}' is not a recognised dataset type." 
+ ) # ---------------------- Public ----------------------------------------- # @property @@ -322,7 +324,9 @@ def data(self, serializable=False) -> pd.DataFrame: if serializable: # need to force "object" type to allow None values return_df = self._df.astype(object, copy=True) - return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) + return_df.where( + cond=pd.notnull(return_df), other=None, inplace=True + ) return return_df return self._df.copy(deep=True) @@ -406,7 +410,9 @@ def validate( ._validate_genomic_variants(targetseq, relaxed_ordering) ._validate_transcript_variants(targetseq, relaxed_ordering) ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column(allow_duplicates=allow_index_duplicates) + ._validate_index_column( + allow_duplicates=allow_index_duplicates + ) ) if self.is_empty: @@ -481,7 +487,9 @@ def _column_order(self) -> Dict[str, int]: self.HGVSColumns.PROTEIN: 2, **{ c: (2 + i) - for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) + for (i, c) in enumerate( + self.AdditionalColumns.options(), start=1 + ) }, }, ) @@ -555,7 +563,9 @@ def _normalize_data(self) -> "MaveDataset": self._df[c] = np.NaN column_order = self._column_order - sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) + sorted_columns = list( + sorted(self.columns, key=lambda x: column_order[x]) + ) self._df = self._df[sorted_columns] return self @@ -715,7 +725,9 @@ def _validate_protein_variants( return self - def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": + def _validate_index_column( + self, allow_duplicates: bool = False + ) -> "MaveDataset": # TODO Django dependent """ @@ -814,7 +826,9 @@ def validate_variant(variant: str): return variant validated = Variant( - variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering + variant, + targetseq=targetseq, + relaxed_ordering=relaxed_ordering, ) prefix = validated.prefix.lower() prefixes.add(prefix) @@ -949,6 
+963,7 @@ def _validate_variant_prefix_for_column( class MaveScoresDataset(MaveDataset): # TODO """ """ + class AdditionalColumns: # TODO """ """ @@ -1018,7 +1033,9 @@ def _normalize_data(self) -> "MaveDataset": for c in should_be_numeric: if c in self.columns: try: - self._df[c] = self._df[c].astype(dtype=float, errors="raise") + self._df[c] = self._df[c].astype( + dtype=float, errors="raise" + ) except ValueError as e: self._errors.append(f"{c}: {str(e)}") @@ -1028,6 +1045,7 @@ def _normalize_data(self) -> "MaveDataset": class MaveCountsDataset(MaveDataset): # TODO """ """ + @property def label(self) -> str: # TODO Django dependent From d5cd92fe3d2cbb88317a963d7bdd9acf23d62041 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 16:07:16 +1000 Subject: [PATCH 294/877] remove reimplementation of Django FileExtensionValidator --- mavecore/validation/dataset_validators.py | 6 +- mavecore/validation/general_validators.py | 73 ----------------------- 2 files changed, 1 insertion(+), 78 deletions(-) delete mode 100644 mavecore/validation/general_validators.py diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index e675251..fd59f2d 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -9,6 +9,7 @@ from mavecore.validation.utilities import is_null + def is_null(value): """ Checks if a stripped/lowercase value is one of the recognized NA or NULL string values. 
@@ -27,9 +28,6 @@ def is_null(value): value = str(value).strip().lower() return constants.null_values_re.fullmatch(value) or not value -validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) class WordLimitValidator: """ @@ -385,5 +383,3 @@ def validate_scoreset_json(dict_): if len(extras) > 0: extras = [k for k in dict_.keys() if k not in required_columns] raise ValidationError("Encountered unexpected keys extras") - - diff --git a/mavecore/validation/general_validators.py b/mavecore/validation/general_validators.py deleted file mode 100644 index e2962cf..0000000 --- a/mavecore/validation/general_validators.py +++ /dev/null @@ -1,73 +0,0 @@ -# note: FileExtensionValidator is from Django -from pathlib import Path -from mavecore.validation.exceptions import ValidationError -# validate_csv_extension -# validate_gz_extension -# validate_json_extension - -validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) - -class FileExtensionValidator: - # TODO, may need to edit validation error, will try to replicate Django error first - """ - This class validates file extensions and will replace the Django validator of - the same name. - - From Django: - Raises a ValidationError with a code of 'invalid_extension' if the extension of - value.name (value is a File) isn’t found in allowed_extensions. The extension is - compared case-insensitively with allowed_extensions. - """ - message = _("File extension “%(extension)s” is not allowed. " - "Allowed extensions are: %(allowed_extensions)s." 
- ) - code = "invalid_extension" - - def __init__(self, allowed_extensions=None, message=None, code=None): - """ - This constructor sets the values of the FileExtensionValidator. - - Parameters - __________ - allowed_extensions : List[str] - A list of allowed file extensions. - message : str - (default = None) The message assigned to the message attribute. - code : - (default = None) The code assigned to the code attribute. - """ - if allowed_extensions is not None: - allowed_extensions = [ - allowed_extension.lower() for allowed_extension in allowed_extensions - ] - self.allowed_extensions = allowed_extensions - if message is not None: - self.message = message - if code is not None: - self.code = code - - def __call__(self, value): - extension = Path(value.name).suffix[1:].lower() - if ( - self.allowed_extensions is not None - and extension not in self.allowed_extensions - ): - raise ValidationError( - self.message, - code=self.code, - params={ - "extension": extension, - "allowed_extensions": ", ".join(self.allowed_extensions), - "value": value, - }, - ) - - def __eq__(self, other): - return ( - isinstance(other, self.__class__) - and self.allowed_extensions == other.allowed_extensions - and self.message == other.message - and self.code == other.code - ) \ No newline at end of file From 0b82171da603cdcd7451007004dca6bcd71aa2f9 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 16:08:05 +1000 Subject: [PATCH 295/877] update tests to expect ValidationError instead of ValueError --- .../test_dataset_validators.py | 108 ++++++++++++------ 1 file changed, 71 insertions(+), 37 deletions(-) diff --git a/tests/test_validation/test_dataset_validators.py b/tests/test_validation/test_dataset_validators.py index 6b4895a..dfed763 100644 --- a/tests/test_validation/test_dataset_validators.py +++ b/tests/test_validation/test_dataset_validators.py @@ -18,10 +18,12 @@ WordLimitValidator, ) +from mavecore.validation.exceptions import ValidationError + class 
TestWordLimitValidator(TestCase): def test_validation_error_more_than_word_limit(self): - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): n = 5 WordLimitValidator(n)("Word " * (n + 1)) @@ -42,14 +44,18 @@ class TestHeaderFromIO(TestCase): """ def test_can_read_header_from_bytes(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count"] self.assertEqual(expected, header) def test_removes_quotes_from_header(self): file = BytesIO( - '"{}","score","count,nt"\n'.format(constants.hgvs_nt_column).encode() + '"{}","score","count,nt"\n'.format( + constants.hgvs_nt_column + ).encode() ) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count,nt"] @@ -62,16 +68,21 @@ def test_can_read_header_from_string(self): self.assertEqual(expected, header) def test_strips_whitespace(self): - file = StringIO(" {} , score , count\n".format(constants.hgvs_nt_column)) + file = StringIO( + " {} , score , count\n".format(constants.hgvs_nt_column) + ) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count"] self.assertEqual(expected, header) def test_returns_file_position_to_begining(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) read_header_from_io(file) self.assertEqual( - file.read(), "{},score,count\n".format(constants.hgvs_nt_column).encode() + file.read(), + "{},score,count\n".format(constants.hgvs_nt_column).encode(), ) @@ -84,9 +95,11 @@ class TestNoNullInColumnsValidator(TestCase): def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + 
"{},score,{}\n".format( + constants.hgvs_nt_column, value + ).encode() ) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): header = read_header_from_io(file) validate_header_contains_no_null_columns(header) @@ -105,12 +118,14 @@ class TestAtLeastOneNumericColumnValidator(TestCase): def test_raises_valuerror_when_less_than_2_values_in_column(self): file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): header = read_header_from_io(file) validate_at_least_one_additional_column(header) def test_does_not_raise_valuerror_2_or_more_values_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) header = read_header_from_io(file) validate_at_least_one_additional_column(header) # Should pass @@ -126,24 +141,30 @@ class TestHgvsInHeaderValidator(TestCase): def test_raises_valuerror_when_neither_hgvs_col_in_column(self): file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): header = read_header_from_io(file) validate_has_hgvs_in_header(header) def test_hgvs_must_be_lowercase(self): file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() + "{},score,count\n".format( + constants.hgvs_nt_column.upper() + ).encode() ) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): header = read_header_from_io(file) validate_has_hgvs_in_header(header) def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column).encode() + ) header = read_header_from_io(file) validate_has_hgvs_in_header(header) # Should pass - file = 
BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) + file = BytesIO( + "{},score,count\n".format(constants.hgvs_pro_column).encode() + ) header = read_header_from_io(file) validate_has_hgvs_in_header(header) # Should pass @@ -169,7 +190,7 @@ def test_ve_counts_defines_different_nt_variants(self): constants.hgvs_splice_column: [None], } ) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_datasets_define_same_variants(scores, counts) def test_ve_counts_defines_different_splice_variants(self): @@ -187,7 +208,7 @@ def test_ve_counts_defines_different_splice_variants(self): constants.hgvs_pro_column: [None], } ) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_datasets_define_same_variants(scores, counts) def test_ve_counts_defines_different_pro_variants(self): @@ -205,7 +226,7 @@ def test_ve_counts_defines_different_pro_variants(self): constants.hgvs_pro_column: ["p.Leu75Glu"], } ) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_datasets_define_same_variants(scores, counts) def test_passes_when_same_variants_defined(self): @@ -234,20 +255,22 @@ class TestValidateScoreSetCountDataInputValidator(TestCase): def test_raises_valuerror_when_hgvs_not_in_column(self): file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_count_data_input(file) def test_raises_valuerror_no_numeric_column(self): file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_count_data_input(file) def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + "{},score,{}\n".format( + constants.hgvs_nt_column, value + ).encode() ) - with 
self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_count_data_input(file) @@ -259,25 +282,27 @@ class TestValidateScoreSetScoreDataInputValidator(TestCase): def test_raises_valuerror_when_hgvs_not_in_column(self): file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_score_data_input(file) def test_raises_valuerror_no_numeric_column(self): file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_score_data_input(file) def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + "{},score,{}\n".format( + constants.hgvs_nt_column, value + ).encode() ) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_score_data_input(file) def test_validatation_error_score_not_in_header(self): file = BytesIO("{},count\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_score_data_input(file) @@ -292,37 +317,46 @@ def test_valueerror_unexptected_columns(self): constants.score_columns: ["score"], constants.count_columns: [], } - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_json(field) def test_valueerror_values_not_lists(self): - field = {constants.score_columns: ["score"], constants.count_columns: {}} - with self.assertRaises(ValueError): + field = { + constants.score_columns: ["score"], + constants.count_columns: {}, + } + with self.assertRaises(ValidationError): validate_scoreset_json(field) def test_valueerror_list_values_not_strings(self): - field = {constants.score_columns: [b"score"], constants.count_columns: []} - with 
self.assertRaises(ValueError): + field = { + constants.score_columns: [b"score"], + constants.count_columns: [], + } + with self.assertRaises(ValidationError): validate_scoreset_json(field) def test_valueerror_empty_score_columns(self): field = {constants.score_columns: [], constants.count_columns: []} - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_json(field) def test_valueerror_missing_dict_columns(self): # constants.score_columns missing field = {constants.count_columns: []} - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_json(field) # constants.count_columns missing field = {constants.score_columns: ["score"]} - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): validate_scoreset_json(field) def test_valueerror_missing_header_columns(self): # constants.score_columns columns missing 'score' - field = {constants.score_columns: ["hgvs"], constants.count_columns: []} - with self.assertRaises(ValueError): + field = { + constants.score_columns: ["hgvs"], + constants.count_columns: [], + } + with self.assertRaises(ValidationError): validate_scoreset_json(field) From 4262ca11160d00b8ad50ecd8c6f82d4b4282b1c4 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 16:13:49 +1000 Subject: [PATCH 296/877] minimal dev docs --- README.md | 15 +++++++++++++++ requirements-dev.txt | 1 + 2 files changed, 16 insertions(+) diff --git a/README.md b/README.md index 738a35b..7d984d5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,17 @@ # MaveCore Shared MaveDB and MaveTools functionality + +## Contributing + +To contribute to MaveCore development, please install the additional requirements: +``` +pip install -r requirements-dev.txt +``` + +To run the tests and generate an HTML coverage report use: +``` +coverage run -m unittest && coverage html +``` + +By default, the coverage report will be located at `htmlcov/index.html`. 
+Open this file in your browser to identify lines that have not been adequately covered by the test suite. diff --git a/requirements-dev.txt b/requirements-dev.txt index 416634f..d535553 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,2 @@ pre-commit +coverage From 9fb656f37728084b3291d570655addaf95676122 Mon Sep 17 00:00:00 2001 From: Alan Rubin Date: Mon, 4 Apr 2022 16:15:13 +1000 Subject: [PATCH 297/877] black formatting --- .pre-commit-config.yaml | 1 - mavecore/validation/dataset_validators.py | 9 +--- mavecore/validation/exceptions.py | 18 +++++--- mavecore/validation/urn_validators.py | 4 +- mavecore/validation/utilities.py | 3 +- .../validation/variant_validators/dataset.py | 39 +++++----------- .../validation/variant_validators/hgvs.py | 5 +-- setup.py | 4 +- .../test_dataset_validators.py | 44 +++++-------------- 9 files changed, 41 insertions(+), 86 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8f4bf35..e892c22 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,4 +16,3 @@ repos: hooks: - id: black language_version: python3 - args: [--line-length=79] diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index fd59f2d..b87a1eb 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -352,9 +352,7 @@ def validate_scoreset_json(dict_): for key in required_columns: if key not in dict_.keys(): - raise ValidationError( - "Scoreset data is missing the required key " + key - ) + raise ValidationError("Scoreset data is missing the required key " + key) columns = dict_[key] if not all([isinstance(c, str) for c in columns]): @@ -363,10 +361,7 @@ def validate_scoreset_json(dict_): if not isinstance(columns, list): type_ = type(columns).__name__ raise ValidationError( - "Value for " - + key.replace("_", " ") - + " must be a list not " - + type_ + "Value for " + key.replace("_", " ") + " must be a 
list not " + type_ ) # Check score columns is not-empty and at least contains hgvs and score diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 466c2fa..27ff6e8 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -28,7 +28,11 @@ def __init__(self, message, code=None, params=None): elif not hasattr(message, "message"): message = message.error_list else: - message, code, params = message.message, message.code, message.params + message, code, params = ( + message.message, + message.code, + message.params, + ) if isinstance(message, dict): self.error_dict = {} @@ -121,10 +125,12 @@ def make_hashable(value): The returned value should generate the same hash for equal values. """ if isinstance(value, dict): - return tuple([ - (key, make_hashable(nested_value)) - for key, nested_value in sorted(value.items()) - ]) + return tuple( + [ + (key, make_hashable(nested_value)) + for key, nested_value in sorted(value.items()) + ] + ) # Try hash to avoid converting a hashable iterable (e.g. string, frozenset) # to a tuple. try: @@ -146,4 +152,4 @@ def is_iterable(x): except TypeError: return False else: - return True \ No newline at end of file + return True diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation/urn_validators.py index 341dba6..9495e34 100644 --- a/mavecore/validation/urn_validators.py +++ b/mavecore/validation/urn_validators.py @@ -110,9 +110,7 @@ def validate_mavedb_urn_experiment(urn): If the Experiemnt urn is not valid. 
""" if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "{}'s is not a valid Experiment urn.".format(urn) - ) + raise ValidationError("{}'s is not a valid Experiment urn.".format(urn)) def validate_mavedb_urn_scoreset(urn): diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 4ea0480..d9bd645 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -1,5 +1,6 @@ from mavecore.validation.constants import null_values_re + def is_null(value): """ Returns True if a stripped/lowercase value in in `nan_col_values`. @@ -15,4 +16,4 @@ def is_null(value): True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. """ value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value \ No newline at end of file + return null_values_re.fullmatch(value) or not value diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index 088e5c8..1fa36dd 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -64,9 +64,7 @@ def options(cls) -> List[str]: # ---------------------- Construction------------------------------------ # @classmethod - def for_scores( - cls, file: Union[str, TextIO, BinaryIO] - ) -> "MaveScoresDataset": + def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": # TODO Django dependent """ @@ -82,9 +80,7 @@ def for_scores( return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) @classmethod - def for_counts( - cls, file: Union[str, TextIO, BinaryIO] - ) -> "MaveCountsDataset": + def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": # TODO Django dependent """ @@ -131,8 +127,7 @@ def _for_type( handle = StringIO(file_contents) else: raise TypeError( - f"Expected file path or buffer object. 
" - f"Got '{type(file).__name__}'" + f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" ) extra_na_values = set( @@ -161,9 +156,7 @@ def _for_type( elif dataset_type == cls.DatasetType.COUNTS: return MaveCountsDataset(df) else: - raise ValueError( - f"'{dataset_type}' is not a recognised dataset type." - ) + raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") # ---------------------- Public ----------------------------------------- # @property @@ -324,9 +317,7 @@ def data(self, serializable=False) -> pd.DataFrame: if serializable: # need to force "object" type to allow None values return_df = self._df.astype(object, copy=True) - return_df.where( - cond=pd.notnull(return_df), other=None, inplace=True - ) + return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) return return_df return self._df.copy(deep=True) @@ -410,9 +401,7 @@ def validate( ._validate_genomic_variants(targetseq, relaxed_ordering) ._validate_transcript_variants(targetseq, relaxed_ordering) ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column( - allow_duplicates=allow_index_duplicates - ) + ._validate_index_column(allow_duplicates=allow_index_duplicates) ) if self.is_empty: @@ -487,9 +476,7 @@ def _column_order(self) -> Dict[str, int]: self.HGVSColumns.PROTEIN: 2, **{ c: (2 + i) - for (i, c) in enumerate( - self.AdditionalColumns.options(), start=1 - ) + for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) }, }, ) @@ -563,9 +550,7 @@ def _normalize_data(self) -> "MaveDataset": self._df[c] = np.NaN column_order = self._column_order - sorted_columns = list( - sorted(self.columns, key=lambda x: column_order[x]) - ) + sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) self._df = self._df[sorted_columns] return self @@ -725,9 +710,7 @@ def _validate_protein_variants( return self - def _validate_index_column( - self, allow_duplicates: bool = False - ) -> "MaveDataset": + def 
_validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": # TODO Django dependent """ @@ -1033,9 +1016,7 @@ def _normalize_data(self) -> "MaveDataset": for c in should_be_numeric: if c in self.columns: try: - self._df[c] = self._df[c].astype( - dtype=float, errors="raise" - ) + self._df[c] = self._df[c].astype(dtype=float, errors="raise") except ValueError as e: self._errors.append(f"{c}: {str(e)}") diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 4ef39e3..6caa049 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -14,6 +14,7 @@ from mavecore.validation.utilities import is_null + def validate_hgvs_string( value: Union[str, bytes], column: Optional[str] = None, @@ -108,9 +109,7 @@ def validate_hgvs_string( f"protein variant prefix is 'p.'." ) else: - raise ValueError( - "Unknown column '{}'. Expected nt, splice or p".format(column) - ) + raise ValueError("Unknown column '{}'. Expected nt, splice or p".format(column)) return str(variant) diff --git a/setup.py b/setup.py index 286779f..18adfd0 100644 --- a/setup.py +++ b/setup.py @@ -15,9 +15,7 @@ version="0.1.4", author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", - description=( - "MaveCore implements shared functionality for MaveTools and MaveDB." 
- ), + description=("MaveCore implements shared functionality for MaveTools and MaveDB."), long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/VariantEffect/MaveCore/tree/testMaveCore", diff --git a/tests/test_validation/test_dataset_validators.py b/tests/test_validation/test_dataset_validators.py index dfed763..8dbd4a2 100644 --- a/tests/test_validation/test_dataset_validators.py +++ b/tests/test_validation/test_dataset_validators.py @@ -44,18 +44,14 @@ class TestHeaderFromIO(TestCase): """ def test_can_read_header_from_bytes(self): - file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column).encode() - ) + file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count"] self.assertEqual(expected, header) def test_removes_quotes_from_header(self): file = BytesIO( - '"{}","score","count,nt"\n'.format( - constants.hgvs_nt_column - ).encode() + '"{}","score","count,nt"\n'.format(constants.hgvs_nt_column).encode() ) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count,nt"] @@ -68,17 +64,13 @@ def test_can_read_header_from_string(self): self.assertEqual(expected, header) def test_strips_whitespace(self): - file = StringIO( - " {} , score , count\n".format(constants.hgvs_nt_column) - ) + file = StringIO(" {} , score , count\n".format(constants.hgvs_nt_column)) header = read_header_from_io(file) expected = [constants.hgvs_nt_column, "score", "count"] self.assertEqual(expected, header) def test_returns_file_position_to_begining(self): - file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column).encode() - ) + file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) read_header_from_io(file) self.assertEqual( file.read(), @@ -95,9 +87,7 @@ class TestNoNullInColumnsValidator(TestCase): def 
test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format( - constants.hgvs_nt_column, value - ).encode() + "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() ) with self.assertRaises(ValidationError): header = read_header_from_io(file) @@ -123,9 +113,7 @@ def test_raises_valuerror_when_less_than_2_values_in_column(self): validate_at_least_one_additional_column(header) def test_does_not_raise_valuerror_2_or_more_values_in_column(self): - file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column).encode() - ) + file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) header = read_header_from_io(file) validate_at_least_one_additional_column(header) # Should pass @@ -147,24 +135,18 @@ def test_raises_valuerror_when_neither_hgvs_col_in_column(self): def test_hgvs_must_be_lowercase(self): file = BytesIO( - "{},score,count\n".format( - constants.hgvs_nt_column.upper() - ).encode() + "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() ) with self.assertRaises(ValidationError): header = read_header_from_io(file) validate_has_hgvs_in_header(header) def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): - file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column).encode() - ) + file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) header = read_header_from_io(file) validate_has_hgvs_in_header(header) # Should pass - file = BytesIO( - "{},score,count\n".format(constants.hgvs_pro_column).encode() - ) + file = BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) header = read_header_from_io(file) validate_has_hgvs_in_header(header) # Should pass @@ -266,9 +248,7 @@ def test_raises_valuerror_no_numeric_column(self): def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format( - 
constants.hgvs_nt_column, value - ).encode() + "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() ) with self.assertRaises(ValidationError): validate_scoreset_count_data_input(file) @@ -293,9 +273,7 @@ def test_raises_valuerror_no_numeric_column(self): def test_raises_valuerror_when_null_values_in_column(self): for value in constants.null_values_list: file = BytesIO( - "{},score,{}\n".format( - constants.hgvs_nt_column, value - ).encode() + "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() ) with self.assertRaises(ValidationError): validate_scoreset_score_data_input(file) From f91101ffecf0a602d7ab587bf7abb8cb18361f14 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Mon, 4 Apr 2022 16:20:32 +1000 Subject: [PATCH 298/877] Upload new MaveCore version 0.1.5 to PyPI. --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 18adfd0..2187562 100644 --- a/setup.py +++ b/setup.py @@ -12,13 +12,13 @@ setuptools.setup( name="mavecore", - version="0.1.4", + version="0.1.5", author="MaveDB Developers", author_email="alan.rubin@wehi.edu.au", description=("MaveCore implements shared functionality for MaveTools and MaveDB."), long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/VariantEffect/MaveCore/tree/testMaveCore", + url="https://github.com/VariantEffect/MaveCore/tree/release/0.1", packages=setuptools.find_packages(), classifiers=[ "Development Status :: 2 - Pre-Alpha", From e0466cec2bc1b9795a64179cb9189e1cfcc9a798 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:52:54 -0700 Subject: [PATCH 299/877] change error type --- mavecore/validation/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 466c2fa..bfb2209 100644 --- a/mavecore/validation/exceptions.py +++ 
b/mavecore/validation/exceptions.py @@ -4,7 +4,7 @@ NON_FIELD_ERRORS = "__all__" -class ValidationError(Exception): +class ValidationError(ValueError): None From 2e4986ec6cd7725c4d9aa3ef1334e507f2bbffe9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:53:28 -0700 Subject: [PATCH 300/877] comment out unneeded code --- mavecore/validation/exceptions.py | 43 +++++++++++++++++-------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index bfb2209..b8cec3d 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -7,19 +7,21 @@ class ValidationError(ValueError): None - +""" class ValidationError2(Exception): - """An error while validating data.""" - - def __init__(self, message, code=None, params=None): - """ - The `message` argument can be a single error, a list of errors, or a - dictionary that maps field names to lists of errors. What we define as - an "error" can be either a simple string or an instance of - ValidationError with its message attribute set, and what we define as - list or dictionary can be an actual `list` or `dict` or an instance - of ValidationError with its `error_list` or `error_dict` attribute set. - """ + + #An error while validating data. + + + #def __init__(self, message, code=None, params=None): + + #The `message` argument can be a single error, a list of errors, or a + #dictionary that maps field names to lists of errors. What we define as + #an "error" can be either a simple string or an instance of + #ValidationError with its message attribute set, and what we define as + #list or dictionary can be an actual `list` or `dict` or an instance + #of ValidationError with its `error_list` or `error_dict` attribute set. 
+ super().__init__(message, code, params) if isinstance(message, ValidationError): @@ -115,11 +117,11 @@ def __hash__(self): def make_hashable(value): - """ - Attempt to make value hashable or raise a TypeError if it fails. - The returned value should generate the same hash for equal values. - """ + #Attempt to make value hashable or raise a TypeError if it fails. + + #The returned value should generate the same hash for equal values. + if isinstance(value, dict): return tuple([ (key, make_hashable(nested_value)) @@ -138,12 +140,13 @@ def make_hashable(value): def is_iterable(x): - """ - An implementation independent way of checking for iterables - """ + + #An implementation independent way of checking for iterables + try: iter(x) except TypeError: return False else: - return True \ No newline at end of file + return True +""" \ No newline at end of file From 14e96aa6060aa10400fbbbd7d9217bd5dc705ffd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:54:39 -0700 Subject: [PATCH 301/877] will not sue FileExtensionValidator --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index dd875c3..7995c16 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -9,7 +9,7 @@ from mavecore.validation.utilities import is_null -from mavecore.validation.general_validators import FileExtensionValidator +#from mavecore.validation.general_validators import FileExtensionValidator # TODO reimplement file extension validators from MaveDB # validate_csv_extension From f6e204f87b3c5d8581226a3c40fb386627de0964 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:55:33 -0700 Subject: [PATCH 302/877] will not sue FileExtensionValidator --- mavecore/validation/dataset_validators.py | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7995c16..ef3698b 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -15,10 +15,10 @@ # validate_csv_extension # validate_gz_extension # validate_json_extension - -validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) +#TODO find another way to validate file extensions +#validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) +#validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) +#validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) class WordLimitValidator: """ From c61d43dd4e1194d33b8f2d585bbde106fe650abc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 11:57:00 -0700 Subject: [PATCH 303/877] will not use FileExtensionValidator, delete file --- mavecore/validation/general_validators.py | 73 ----------------------- 1 file changed, 73 deletions(-) delete mode 100644 mavecore/validation/general_validators.py diff --git a/mavecore/validation/general_validators.py b/mavecore/validation/general_validators.py deleted file mode 100644 index e2962cf..0000000 --- a/mavecore/validation/general_validators.py +++ /dev/null @@ -1,73 +0,0 @@ -# note: FileExtensionValidator is from Django -from pathlib import Path -from mavecore.validation.exceptions import ValidationError -# validate_csv_extension -# validate_gz_extension -# validate_json_extension - -validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -validate_json_extension = 
FileExtensionValidator(allowed_extensions=["json"]) - -class FileExtensionValidator: - # TODO, may need to edit validation error, will try to replicate Django error first - """ - This class validates file extensions and will replace the Django validator of - the same name. - - From Django: - Raises a ValidationError with a code of 'invalid_extension' if the extension of - value.name (value is a File) isn’t found in allowed_extensions. The extension is - compared case-insensitively with allowed_extensions. - """ - message = _("File extension “%(extension)s” is not allowed. " - "Allowed extensions are: %(allowed_extensions)s." - ) - code = "invalid_extension" - - def __init__(self, allowed_extensions=None, message=None, code=None): - """ - This constructor sets the values of the FileExtensionValidator. - - Parameters - __________ - allowed_extensions : List[str] - A list of allowed file extensions. - message : str - (default = None) The message assigned to the message attribute. - code : - (default = None) The code assigned to the code attribute. 
- """ - if allowed_extensions is not None: - allowed_extensions = [ - allowed_extension.lower() for allowed_extension in allowed_extensions - ] - self.allowed_extensions = allowed_extensions - if message is not None: - self.message = message - if code is not None: - self.code = code - - def __call__(self, value): - extension = Path(value.name).suffix[1:].lower() - if ( - self.allowed_extensions is not None - and extension not in self.allowed_extensions - ): - raise ValidationError( - self.message, - code=self.code, - params={ - "extension": extension, - "allowed_extensions": ", ".join(self.allowed_extensions), - "value": value, - }, - ) - - def __eq__(self, other): - return ( - isinstance(other, self.__class__) - and self.allowed_extensions == other.allowed_extensions - and self.message == other.message - and self.code == other.code - ) \ No newline at end of file From 9a94138ddb6d0056da61fed7c9d101b263fdfbcf Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:21:40 -0700 Subject: [PATCH 304/877] remove comments --- mavecore/validation/variant_validators/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py index d6c7288..2d5fa33 100644 --- a/mavecore/validation/variant_validators/dataset.py +++ b/mavecore/validation/variant_validators/dataset.py @@ -27,7 +27,6 @@ class MaveDataset: # TODO Django dependent - """ class DatasetType: # TODO @@ -55,7 +54,6 @@ def options(cls) -> List[str]: class AdditionalColumns: # TODO Django dependent - """ @classmethod def options(cls) -> List[str]: # TODO Django dependent From 835e5000a7c0d01f96f458d92ffeb25979499de0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:22:43 -0700 Subject: [PATCH 305/877] edit variable name --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index ef3698b..eafaacf 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -224,7 +224,7 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): "%(label)s file header cannot contain blank/empty/whitespace " "only columns or the following case-insensitive null " "values: {}.".format( - label, ", ".join(constants.readable_null_values) + label, ", ".join(constants.readable_null_values_list) ) ) raise ValidationError(msg) From c120930c29d68decbaecfbace37aac43b5897f3f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:48:48 -0700 Subject: [PATCH 306/877] remove blank line --- mavecore/validation/exceptions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index b8cec3d..bb7abd0 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -3,7 +3,6 @@ NON_FIELD_ERRORS = "__all__" - class ValidationError(ValueError): None From cb378b53041ffc8f0bc56a28ccb2c5ba01d96577 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:49:14 -0700 Subject: [PATCH 307/877] delete ValidationError2 function and helper functions --- mavecore/validation/exceptions.py | 146 +----------------------------- 1 file changed, 1 insertion(+), 145 deletions(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index bb7abd0..91d0df9 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -4,148 +4,4 @@ NON_FIELD_ERRORS = "__all__" class ValidationError(ValueError): - None - -""" -class ValidationError2(Exception): - - #An error while validating data. 
- - - #def __init__(self, message, code=None, params=None): - - #The `message` argument can be a single error, a list of errors, or a - #dictionary that maps field names to lists of errors. What we define as - #an "error" can be either a simple string or an instance of - #ValidationError with its message attribute set, and what we define as - #list or dictionary can be an actual `list` or `dict` or an instance - #of ValidationError with its `error_list` or `error_dict` attribute set. - - super().__init__(message, code, params) - - if isinstance(message, ValidationError): - if hasattr(message, "error_dict"): - message = message.error_dict - elif not hasattr(message, "message"): - message = message.error_list - else: - message, code, params = message.message, message.code, message.params - - if isinstance(message, dict): - self.error_dict = {} - for field, messages in message.items(): - if not isinstance(messages, ValidationError): - messages = ValidationError(messages) - self.error_dict[field] = messages.error_list - - elif isinstance(message, list): - self.error_list = [] - for message in message: - # Normalize plain strings to instances of ValidationError. - if not isinstance(message, ValidationError): - message = ValidationError(message) - if hasattr(message, "error_dict"): - self.error_list.extend(sum(message.error_dict.values(), [])) - else: - self.error_list.extend(message.error_list) - - else: - self.message = message - self.code = code - self.params = params - self.error_list = [self] - - @property - def message_dict(self): - # Trigger an AttributeError if this ValidationError - # doesn't have an error_dict. 
- getattr(self, "error_dict") - - return dict(self) - - @property - def messages(self): - if hasattr(self, "error_dict"): - return sum(dict(self).values(), []) - return list(self) - - def update_error_dict(self, error_dict): - if hasattr(self, "error_dict"): - for field, error_list in self.error_dict.items(): - error_dict.setdefault(field, []).extend(error_list) - else: - error_dict.setdefault(NON_FIELD_ERRORS, []).extend(self.error_list) - return error_dict - - def __iter__(self): - if hasattr(self, "error_dict"): - for field, errors in self.error_dict.items(): - yield field, list(ValidationError(errors)) - else: - for error in self.error_list: - message = error.message - if error.params: - message %= error.params - yield str(message) - - def __str__(self): - if hasattr(self, "error_dict"): - return repr(dict(self)) - return repr(list(self)) - - def __repr__(self): - return "ValidationError(%s)" % self - - def __eq__(self, other): - if not isinstance(other, ValidationError): - return NotImplemented - return hash(self) == hash(other) - - def __hash__(self): - if hasattr(self, "message"): - return hash( - ( - self.message, - self.code, - make_hashable(self.params), - ) - ) - if hasattr(self, "error_dict"): - return hash(make_hashable(self.error_dict)) - return hash(tuple(sorted(self.error_list, key=operator.attrgetter("message")))) - - -def make_hashable(value): - - #Attempt to make value hashable or raise a TypeError if it fails. - - #The returned value should generate the same hash for equal values. - - if isinstance(value, dict): - return tuple([ - (key, make_hashable(nested_value)) - for key, nested_value in sorted(value.items()) - ]) - # Try hash to avoid converting a hashable iterable (e.g. string, frozenset) - # to a tuple. - try: - hash(value) - except TypeError: - if is_iterable(value): - return tuple(map(make_hashable, value)) - # Non-hashable, non-iterable. 
- raise - return value - - -def is_iterable(x): - - #An implementation independent way of checking for iterables - - try: - iter(x) - except TypeError: - return False - else: - return True -""" \ No newline at end of file + None \ No newline at end of file From 07773959a4c266967349fc86462a98b5546cc64d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:49:46 -0700 Subject: [PATCH 308/877] remove import --- mavecore/validation/exceptions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 91d0df9..4751c35 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -1,5 +1,4 @@ # note: ValidationError2 code in this file is from Django -import operator NON_FIELD_ERRORS = "__all__" From 991b35312beee71e51e3ef487e31240f8a90ffc0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:50:17 -0700 Subject: [PATCH 309/877] add blank line --- mavecore/validation/exceptions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 4751c35..b905287 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -2,5 +2,6 @@ NON_FIELD_ERRORS = "__all__" + class ValidationError(ValueError): None \ No newline at end of file From 4649f8e4f4ae2ac2d93073a743c43f268900152b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 12:50:31 -0700 Subject: [PATCH 310/877] remove unneeded note --- mavecore/validation/exceptions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index b905287..b3e419b 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -1,5 +1,3 @@ -# note: ValidationError2 code in this file is from Django - 
NON_FIELD_ERRORS = "__all__" From e3c912e8d8fe990af4f7c0601b57e0dfdefebe6f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:30:33 -0700 Subject: [PATCH 311/877] add description to WordLimitValidator class --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index eafaacf..7e24a28 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -22,7 +22,7 @@ class WordLimitValidator: """ - This class + This class validates the word limit set for a given object. Attributes __________ From 5eafe5a479bdb40621ef02c0a644b5e60fc8896a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:31:39 -0700 Subject: [PATCH 312/877] update code and counter attribute type and descriptions in WordLimitValidator class docstring --- mavecore/validation/dataset_validators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7e24a28..fae858c 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -29,9 +29,9 @@ class WordLimitValidator: message : str Message template to describe how many words a field is limited to. code : str - - counter : str - + code attribute is set to invalid + counter : `re.Pattern` + The regex pattern that will be used to identify the number of words. """ message = "This field is limited to {} words." 
From abee0c06be3f60c17861c097df2a32c53f900bcd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:31:51 -0700 Subject: [PATCH 313/877] delete comment --- mavecore/validation/dataset_validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index fae858c..c677888 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -39,7 +39,6 @@ class WordLimitValidator: counter = re.compile(r"\w+\b", flags=re.IGNORECASE) def __init__(self, word_limit, message=None, code=None): - # check the code parameter type """ This constructor sets the values of the WordLimitValidator class attributes message, code, and counter. From 39c420bfff7d007c1b246befa4e49d7f8e11d539 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:32:39 -0700 Subject: [PATCH 314/877] edit description of message parameter and type of code parameter in constructor --- mavecore/validation/dataset_validators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index c677888..8c28ccc 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -48,8 +48,8 @@ def __init__(self, word_limit, message=None, code=None): word_limit : int The word limit assigned to the word limit attribute. message : str - (default = None) The message assigned to the message attribute. - code : + (default = None) The value assigned to the message attribute that is displayed when an error is raised. + code : str (default = None) The code assigned to the code attribute. 
""" if message is not None: From dcac693b76cdd287e5aa76c0651ca60a379e2004 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:33:07 -0700 Subject: [PATCH 315/877] add description to __call__ method docstring --- mavecore/validation/dataset_validators.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 8c28ccc..5e5af91 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -60,6 +60,10 @@ def __init__(self, word_limit, message=None, code=None): def __call__(self, value): """ + This special method will raise a ValidationError if the number of times the regex pattern (defined by the + counter attribute) found in the value parameter exceeds the word_limit attribute value. In short, __call__ + checks if the number of words exceeds the word_limit. + Parameters __________ value : From 16cfe072d2d417fb97cc973568adf2a420f15896 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:33:27 -0700 Subject: [PATCH 316/877] edit value parameter type and description in __call__ method docstring --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 5e5af91..f2673fd 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -66,7 +66,8 @@ def __call__(self, value): Parameters __________ - value : + value : str + The string in which the pattern defined in the counter attribute will be found. 
Returns _______ From 6eb9e641d10f4a5c279e5cb51d4e573f43bee8a7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:33:45 -0700 Subject: [PATCH 317/877] edit return in __call__ method docstring --- mavecore/validation/dataset_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index f2673fd..7b58096 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -71,6 +71,7 @@ def __call__(self, value): Returns _______ + If value is not empty or false. Raises ______ From 458155e12af4513c35dbe6a610ad4dafbb6e40c9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:34:10 -0700 Subject: [PATCH 318/877] edit error description in __call__ method docstring --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 7b58096..cf10810 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -76,7 +76,8 @@ def __call__(self, value): Raises ______ ValidationError - If + If the number of times the regex pattern (defined by the counter attribute) found in the value parameter + exceeds the word_limit attribute value. 
""" if not value: return From a46d4cd720ec3dbb07de2aab220f05e2178764e8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:34:56 -0700 Subject: [PATCH 319/877] update msg parameter description in read_header_from_io function docstring --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index cf10810..e4ca725 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -97,7 +97,8 @@ def read_header_from_io(file, label=None, msg=None): label : str (default = None) msg : str - (default = None) The message that is printed in the event of an error is raised. + (default = None) The message that is printed in the event of an error is raised. The value is updated within + the function. Returns _______ From 167d97b0f4904a3a9c264963fa56154377875d6b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:35:29 -0700 Subject: [PATCH 320/877] add description to validate_has_hgvs_in_header function docstring --- mavecore/validation/dataset_validators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index e4ca725..b42ee30 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -133,6 +133,8 @@ def read_header_from_io(file, label=None, msg=None): def validate_has_hgvs_in_header(header, label=None, msg=None): """ + Determines whether or not hgvs is in a header. 
+ Parameters __________ header : From bc0481810a2c7974787918334243099344ca372e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:36:20 -0700 Subject: [PATCH 321/877] add type and description description to header parameter in validate_has_hgvs_in_header function docstring --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index b42ee30..37258f4 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -137,7 +137,8 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): Parameters __________ - header : + header : str + The first line of the file being validated. label : default = None msg : From 3dbb168b68446c9ee1d58904bb05dd7367159d46 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:36:47 -0700 Subject: [PATCH 322/877] edit description description in label parameter in validate_has_hgvs_in_header function docstring --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 37258f4..1164de5 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -140,7 +140,7 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): header : str The first line of the file being validated. 
label : - default = None + (default = None) msg : default = None From 8771a5cbebf9b9226405babe46986879426114bd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:37:04 -0700 Subject: [PATCH 323/877] add description to msg parameter in validate_has_hgvs_in_header function docstring --- mavecore/validation/dataset_validators.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 1164de5..9a4806b 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -142,7 +142,8 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): label : (default = None) msg : - default = None + (default = None) The message that is printed in the event of an error is raised. The value is updated within + the function. Raises ______ From 04cfee53f728202fc106a0ccdad639464c62ae36 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:37:32 -0700 Subject: [PATCH 324/877] add description to ValidationError in validate_has_hgvs_in_header function docstring --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 9a4806b..8e21ccb 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -148,7 +148,7 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): Raises ______ ValidationError - If + If the header is empty and there exists a value for the constants.hgvs_columns parameter. 
""" if label is None: label = "Uploaded" From 31e7a69b8807ef336571f0e4bf2bc114cab73618 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:37:46 -0700 Subject: [PATCH 325/877] delete comment and TODO --- mavecore/validation/dataset_validators.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 8e21ccb..b3476a1 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -169,8 +169,6 @@ def validate_has_hgvs_in_header(header, label=None, msg=None): def validate_at_least_one_additional_column(header, label=None, msg=None): - # TODO - # verify parameter types """ This function checks the passed header to see if there exists additional columns besides the three specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. From a88c0297e0f172fb10e4d3d966053199683b8c3c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:38:32 -0700 Subject: [PATCH 326/877] edit types and descriptions to header, label, and msg parameters in validate_at_least_one_additional_column docstring --- mavecore/validation/dataset_validators.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index b3476a1..ed56581 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -175,11 +175,13 @@ def validate_at_least_one_additional_column(header, label=None, msg=None): Parameters __________ - header : - label : - default = None - msg : - default = None + header : str + The first line of the file being validated. + label : str + (default = None) + msg : str + (default = None) The message that is printed in the event of an error is raised. 
The value is updated within + the function. Raises ______ From fbdeda25efc1a23a72ce1edd7231739114817efe Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:39:06 -0700 Subject: [PATCH 327/877] edit types and descriptions to header, label, and msg parameters in validate_header_contains_no_null_columns docstring --- mavecore/validation/dataset_validators.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index ed56581..872dc33 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -213,11 +213,13 @@ def validate_header_contains_no_null_columns(header, label=None, msg=None): Parameters __________ - header : - label : - (default = None) - msg : + header : str + The first line of the file being validated. + label : str (default = None) + msg : str + (default = None) The message that is printed in the event of an error is raised. The value is updated within + the function. Raises ______ From be927e23c4e6f028ed4e15436ae4d97949c10553 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 17:46:39 -0700 Subject: [PATCH 328/877] add description to validate_hgvs_string function docstring --- mavecore/validation/variant_validators/hgvs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 4ef39e3..6712474 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -22,6 +22,7 @@ def validate_hgvs_string( relaxed_ordering: bool = False, ) -> Optional[str]: """ + Validates hgvs string. 
Parameters __________ From 5db822f75029f7663bce7863876a5028d6769e86 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 17:47:08 -0700 Subject: [PATCH 329/877] add parameter type to validate_hgvs_string function docstring parameters --- mavecore/validation/variant_validators/hgvs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 6712474..4db2a1f 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -28,9 +28,9 @@ def validate_hgvs_string( __________ value : Union[str, bytes] column : Optional[str] = None - splice_present : - targetseq : - relaxed_ordering : + splice_present : bool = False + targetseq : Optional[str] = None + relaxed_ordering : bool = False Returns _______ From 3879191750fd294da4793684f9a07e70e895baf1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 17:47:22 -0700 Subject: [PATCH 330/877] add return type to validate_hgvs_string function docstring --- mavecore/validation/variant_validators/hgvs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation/variant_validators/hgvs.py index 4db2a1f..b423ecf 100644 --- a/mavecore/validation/variant_validators/hgvs.py +++ b/mavecore/validation/variant_validators/hgvs.py @@ -34,6 +34,7 @@ def validate_hgvs_string( Returns _______ + Optional[str] Raises ______ From f662a81e8633242698f50c76c25ab5ef05bb7b23 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 18:08:29 -0700 Subject: [PATCH 331/877] delete TODO and comment --- mavecore/validation/variant_validators/variant.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation/variant_validators/variant.py 
b/mavecore/validation/variant_validators/variant.py index bf00e71..68e3849 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -9,8 +9,6 @@ def validate_columns_match(variant, scoreset) -> None: - # TODO - # document errors correctly, note key error """ Validate that a child matches parents defined columns to keep data in sync. From e6a86fc4371f631194341e66e2897f35f3e94025 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 18:08:42 -0700 Subject: [PATCH 332/877] add description to error in docstring --- mavecore/validation/variant_validators/variant.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation/variant_validators/variant.py index 68e3849..140fed4 100644 --- a/mavecore/validation/variant_validators/variant.py +++ b/mavecore/validation/variant_validators/variant.py @@ -24,6 +24,8 @@ def validate_columns_match(variant, scoreset) -> None: If variant score columns do not match scoreset score columns. ValidationError If variant count columns do not match scoreset count columns. + ValidationError + If try fails within try except block. 
""" try: if variant.score_columns != scoreset.score_columns: From fed91e3441f8141a3e12e5b32a9b91eab6c25cf9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 18:14:36 -0700 Subject: [PATCH 333/877] delete comments --- mavecore/validation/dataset_validators.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 872dc33..330772d 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -9,16 +9,6 @@ from mavecore.validation.utilities import is_null -#from mavecore.validation.general_validators import FileExtensionValidator - -# TODO reimplement file extension validators from MaveDB -# validate_csv_extension -# validate_gz_extension -# validate_json_extension -#TODO find another way to validate file extensions -#validate_csv_extension = FileExtensionValidator(allowed_extensions=["csv"]) -#validate_gz_extension = FileExtensionValidator(allowed_extensions=["gz"]) -#validate_json_extension = FileExtensionValidator(allowed_extensions=["json"]) class WordLimitValidator: """ From 049bda0c7a90332e1bd96cf71c63ff93d2c6bf47 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:17:04 -0700 Subject: [PATCH 334/877] add and complete docstring for validate_sra_identifier --- mavecore/validation/metadata_validators.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index cb0e450..4d17fc6 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -6,6 +6,19 @@ def validate_sra_identifier(identifier): + """ + Validates whether the identifier is a valid SRA identifier. + + Parameters + __________ + identifier: str + The identifier to be validated. 
+ + Raises + ______ + ValidationError + If the identifier is not a valid SRA identifier. + """ if not ( idutils.is_sra(identifier) or idutils.is_bioproject(identifier) From 0db184d5c124f41940d6f9099565077215eb9e8d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:17:32 -0700 Subject: [PATCH 335/877] add description to docstring --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 4d17fc6..2a7d524 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -56,6 +56,7 @@ def validate_keyword(kw): def validate_pubmed_identifier(identifier): """ + Validates whether the identifier is a valid PubMed identifier. :param identifier: :return: From d75aa13f6c397607b2c78082a787cb7f2968550a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:17:41 -0700 Subject: [PATCH 336/877] add description to docstring --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 2a7d524..2ae6d46 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -67,6 +67,7 @@ def validate_pubmed_identifier(identifier): def validate_doi_identifier(identifier): """ + Validates whether the identifier is a valid DOI identifier. 
:param identifier: :return: From a436919b4c5883b91cfb10fc46054c8280e46cca Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:17:48 -0700 Subject: [PATCH 337/877] add description to docstring --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 2ae6d46..ed38796 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -78,6 +78,7 @@ def validate_doi_identifier(identifier): def validate_ensembl_identifier(identifier): """ + Validates whether the identifier is a valid Ensembl identifier. :param identifier: :return: From 612ca574ffcb0c98e22a6e19e8863b8368e0814e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:18:16 -0700 Subject: [PATCH 338/877] add parameters and errors to docstring for validate_pubmed_identifier --- mavecore/validation/metadata_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index ed38796..93fac21 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -58,8 +58,15 @@ def validate_pubmed_identifier(identifier): """ Validates whether the identifier is a valid PubMed identifier. - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid PubMed identifier. 
""" if not idutils.is_pmid(identifier): raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") From e7e0809c67f556833dd00f9022911429b70db41e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:18:26 -0700 Subject: [PATCH 339/877] add parameters and errors to docstring for validate_doi_identifier --- mavecore/validation/metadata_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 93fac21..3e3dc7e 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -76,8 +76,15 @@ def validate_doi_identifier(identifier): """ Validates whether the identifier is a valid DOI identifier. - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid DOI identifier. """ if not idutils.is_doi(identifier): raise ValidationError(f"'{identifier}' is not a valid DOI.") From 4bccc50bffecbbf13af95900e09eccb0da474dfb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:18:38 -0700 Subject: [PATCH 340/877] add parameters and errors to docstring for validate_ensembl_identifier --- mavecore/validation/metadata_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 3e3dc7e..16a6eb0 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -94,8 +94,15 @@ def validate_ensembl_identifier(identifier): """ Validates whether the identifier is a valid Ensembl identifier. - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. 
+ + Raises + ______ + ValidationError + If the identifier is not a valid Ensembl identifier. """ if not idutils.is_ensembl(identifier): raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") From 78a82d86098c6928954462cc3b7e07e1d9bebaf9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:18:56 -0700 Subject: [PATCH 341/877] add description to docstring for validate_uniprot_identifier --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 16a6eb0..e72fded 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -110,6 +110,7 @@ def validate_ensembl_identifier(identifier): def validate_uniprot_identifier(identifier): """ + Validates whether the identifier is a valid UniProt identifier. :param identifier: :return: From 4f51d5e2d4ba23d2ee605fdff473dadad07fa30b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:19:28 -0700 Subject: [PATCH 342/877] add parameters and raised errors to docstring for validate_uniprot_identifier --- mavecore/validation/metadata_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index e72fded..a258e79 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -112,8 +112,15 @@ def validate_uniprot_identifier(identifier): """ Validates whether the identifier is a valid UniProt identifier. - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid UniProt identifier. 
""" if not idutils.is_uniprot(identifier): raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") From 377149b19542f0167490492df30506348ead288f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:19:46 -0700 Subject: [PATCH 343/877] add parameters and raised errors to docstring for validate_refseq_identifier --- mavecore/validation/metadata_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index a258e79..dc6e394 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -129,8 +129,15 @@ def validate_uniprot_identifier(identifier): def validate_refseq_identifier(identifier): """ - :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid RefSeq identifier. """ if not idutils.is_refseq(identifier): raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") From f2e32321a167d7d34165a4a2ff09d2ecb682bbba Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:20:05 -0700 Subject: [PATCH 344/877] add parameters and raised errors to docstring for validate_uniprot_list --- mavecore/validation/metadata_validators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index dc6e394..a1ab44b 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -228,8 +228,15 @@ def validate_refseq_list(values): def validate_uniprot_list(values): """ - :param values: - :return: + Parameters + __________ + identifier: List[str] + The list of identifiers to be validated. 
+ + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid UniProt identifier. """ for value in values: if not is_null(value): From 600bde113455703fdda4492ffdf85bd2e4683350 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:20:45 -0700 Subject: [PATCH 345/877] write complete docstring for validate_refseq_list --- mavecore/validation/metadata_validators.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index a1ab44b..f5de556 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -216,9 +216,17 @@ def validate_ensembl_list(values): def validate_refseq_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid RefSeq identifier. - :param values: - :return: + Parameters + __________ + identifier: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid RefSeq identifier. """ for value in values: if not is_null(value): From 8e579d5bea97bc92217ad41573add67e91cae61e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:21:17 -0700 Subject: [PATCH 346/877] write complete docstring for validate_genome_identifier --- mavecore/validation/metadata_validators.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index f5de556..69137f2 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -145,9 +145,17 @@ def validate_refseq_identifier(identifier): def validate_genome_identifier(identifier): """ + Validates whether the identifier is a valid genome identifier. 
- :param identifier: - :return: + Parameters + __________ + identifier: str + The identifier to be validated. + + Raises + ______ + ValidationError + If the identifier is not a valid genome identifier. """ if not idutils.is_genome(identifier): raise ValidationError( From 50c2e47eef695daee52c3d68e8d398da41374a88 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:21:34 -0700 Subject: [PATCH 347/877] write complete docstring for validate_pubmed_list --- mavecore/validation/metadata_validators.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 69137f2..0a2f87e 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -181,8 +181,17 @@ def validate_keyword_list(values): def validate_pubmed_list(values): """ - :param values: - :return: + Validates whether each identifier in a list of identifiers (values) is a valid PubMed identifier. + + Parameters + __________ + identifier: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid PubMed identifier. 
""" for value in values: if not is_null(value): From cb183dff12b53cd5a3d3921871a4eb5e5c2066cd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:21:47 -0700 Subject: [PATCH 348/877] write complete docstring for validate_sra_list --- mavecore/validation/metadata_validators.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index 0a2f87e..f1b01ed 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -200,9 +200,17 @@ def validate_pubmed_list(values): def validate_sra_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid SRA identifier. - :param values: - :return: + Parameters + __________ + identifier: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid SRA identifier. """ for value in values: if not is_null(value): From 8252543132ceb615ad07e49d68ef4f22ab1a0240 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:22:05 -0700 Subject: [PATCH 349/877] write complete docstring for validate_doi_list --- mavecore/validation/metadata_validators.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index f1b01ed..c661edf 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -219,9 +219,17 @@ def validate_sra_list(values): def validate_doi_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid DOI identifier. - :param values: - :return: + Parameters + __________ + identifier: List[str] + The list of identifiers to be validated. 
+ + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid DOI identifier. """ for value in values: if not is_null(value): From f6e6c145427da9a9b6d5120d93525e4913588460 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:22:20 -0700 Subject: [PATCH 350/877] write complete docstring for validate_ensembl_list --- mavecore/validation/metadata_validators.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index c661edf..e2b0978 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -238,9 +238,17 @@ def validate_doi_list(values): def validate_ensembl_list(values): """ + Validates whether each identifier in a list of identifiers (values) is a valid Ensembl identifier. - :param values: - :return: + Parameters + __________ + identifier: List[str] + The list of identifiers to be validated. + + Raises + ______ + ValidationError + If at least one of the identifiers is not a valid Ensemble identifier. """ for value in values: if not is_null(value): From 0fdd63bbf68d0a75cbc7c211179b08b083965047 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:22:47 -0700 Subject: [PATCH 351/877] write description for validate_refseq_identifier docstring --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index e2b0978..d4e44cd 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -128,6 +128,7 @@ def validate_uniprot_identifier(identifier): def validate_refseq_identifier(identifier): """ + Validates whether the identifier is a valid RefSeq identifier. 
Parameters __________ From b4acdf03c17f1bfa75c05c555f0385620fd5ce97 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:23:00 -0700 Subject: [PATCH 352/877] write description for validate_uniprot_list docstring --- mavecore/validation/metadata_validators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index d4e44cd..da5f041 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -277,6 +277,7 @@ def validate_refseq_list(values): def validate_uniprot_list(values): """ + Validates whether each identifer in a list of identifiers (values) is a valid UniProt identifier. Parameters __________ From 934a297ca2b8d86d49ab3d0759b473d4f7aa3f52 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:23:11 -0700 Subject: [PATCH 353/877] delete comment --- mavecore/validation/metadata_validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation/metadata_validators.py index da5f041..70b731b 100644 --- a/mavecore/validation/metadata_validators.py +++ b/mavecore/validation/metadata_validators.py @@ -1,4 +1,3 @@ -# TODO Django dependent, Django forms, whole file needs to be refactored import idutils from mavecore.validation.exceptions import ValidationError From cfc3ec0147f3686349b11775608c06f58b4ad828 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:49:00 -0700 Subject: [PATCH 354/877] delete original_validation --- mavecore/original_validation/__init__.py | 0 mavecore/original_validation/constants.py | 90 -- .../original_validation/dataset_validators.py | 377 ------ mavecore/original_validation/exceptions.py | 2 - .../original_validation/genome_validators.py | 605 ---------- .../metadata_validators.py | 203 
---- .../original_validation/urn_validators.py | 153 --- mavecore/original_validation/validate.py | 69 -- .../variant_validators/__init__.py | 25 - .../variant_validators/dataset.py | 1012 ----------------- .../variant_validators/hgvs.py | 134 --- .../variant_validators/variant.py | 85 -- 12 files changed, 2755 deletions(-) delete mode 100644 mavecore/original_validation/__init__.py delete mode 100644 mavecore/original_validation/constants.py delete mode 100644 mavecore/original_validation/dataset_validators.py delete mode 100644 mavecore/original_validation/exceptions.py delete mode 100644 mavecore/original_validation/genome_validators.py delete mode 100644 mavecore/original_validation/metadata_validators.py delete mode 100644 mavecore/original_validation/urn_validators.py delete mode 100644 mavecore/original_validation/validate.py delete mode 100644 mavecore/original_validation/variant_validators/__init__.py delete mode 100644 mavecore/original_validation/variant_validators/dataset.py delete mode 100644 mavecore/original_validation/variant_validators/hgvs.py delete mode 100644 mavecore/original_validation/variant_validators/variant.py diff --git a/mavecore/original_validation/__init__.py b/mavecore/original_validation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/mavecore/original_validation/constants.py b/mavecore/original_validation/constants.py deleted file mode 100644 index 6630323..0000000 --- a/mavecore/original_validation/constants.py +++ /dev/null @@ -1,90 +0,0 @@ -import re - -""" -Null Constant definitions -""" -NA_value = "NA" -null_values_list = ( - "nan", - "na", - "none", - "", - "undefined", - "n/a", - "null", - "nil", - NA_value, -) - -null_values_re = re.compile( - r"^\s+$|none|nan|na|undefined|n/a|null|nil|{}".format(NA_value), flags=re.IGNORECASE -) - -readable_null_values = [ - "'{}'".format(v) for v in set([v.lower() for v in null_values_list]) if v.strip() -] + ["whitespace"] - -""" -Sequence constants -""" 
-AA_LETTERS = "ABCDEFGHIKLMNPQRSTVWXYZ" -DNA_LETTERS = "ATCG" - -DNA_SEQ_PATTERN = rf"[{DNA_LETTERS}]+" -AA_SEQ_PATTERN = rf"[{AA_LETTERS}]+" - - -""" -Constant definitions for application `experiment`. -""" -from mavecore.validation.urn_validators import ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, -) - -hgvs_nt_column = "hgvs_nt" -hgvs_splice_column = "hgvs_splice" -hgvs_pro_column = "hgvs_pro" -hgvs_columns = sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) -meta_data = "meta_data" -score_columns = "score_columns" -count_columns = "count_columns" -variant_score_data = "score_data" -variant_count_data = "count_data" -required_score_column = "score" - -experimentset_url_pattern = "|".join( - [MAVEDB_EXPERIMENTSET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -experiment_url_pattern = "|".join( - [MAVEDB_EXPERIMENT_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) -scoreset_url_pattern = "|".join( - [MAVEDB_SCORESET_URN_PATTERN[1:-1], MAVEDB_TMP_URN_PATTERN[1:-1]] -) - -any_url_pattern = "|".join( - [experimentset_url_pattern, experiment_url_pattern, scoreset_url_pattern] -) - - -valid_dataset_columns = [score_columns, count_columns] -valid_variant_columns = [variant_score_data, variant_count_data] - -variant_to_scoreset_column = { - variant_score_data: score_columns, - variant_count_data: count_columns, -} -scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} - -# Celery dataset status -processing = "processing" -failed = "failed" -success = "success" - -# User roles -administrator = "administrator" -editor = "editor" -viewer = "viewer" diff --git a/mavecore/original_validation/dataset_validators.py b/mavecore/original_validation/dataset_validators.py deleted file mode 100644 index fd7a20e..0000000 --- a/mavecore/original_validation/dataset_validators.py +++ /dev/null @@ -1,377 +0,0 @@ -import io -import csv -import re - -from 
numpy.testing import assert_array_equal - -from mavecore.validation import constants - - -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. - - Parameters - __________ - value : str - The value to be checked as null or not. - - Returns - _______ - bool - True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. - """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - - -class WordLimitValidator: - """ - This class - - Attributes - __________ - message : str - Message template to describe how many words a field is limited to. - code : str - - counter : str - - """ - - message = "This field is limited to {} words." - code = "invalid" - counter = re.compile(r"\w+\b", flags=re.IGNORECASE) - - def __init__(self, word_limit, message=None, code=None): - # TODO - # check the code parameter type - """ - This constructor sets the values of the WordLimitValidator class attributes - message, code, and counter. - - Parameters - __________ - word_limit : int - The word limit assigned to the word limit attribute. - message : str - (default = None) The message assigned to the message attribute. - code : - (default = None) The code assigned to the code attribute. - """ - if message is not None: - self.message = message - if code is not None: - self.code = code - self.word_limit = int(word_limit) - - def __call__(self, value): - """ - Parameters - __________ - value : - - Returns - _______ - - Raises - ______ - ValueError - If - """ - if not value: - return - if len(self.counter.findall(value)) > self.word_limit: - raise ValueError(self.message.format(self.word_limit)) - - -def read_header_from_io(file, label=None, msg=None): - # TODO - # confirm types for parameters - """ - This takes a file and reads the header from that file. 
- - Parameters - __________ - file : - label : str - (default = None) - msg : str - (default = None) The message that is printed in the event of an error is raised. - - Returns - _______ - str - The header that was read from io. - - Raises - ______ - ValueError - If a header could not be parsed from file. Columns must be coma delimited. Column names - with commas must be escaped by enclosing them in double quotes. - """ - if label is None: - label = "uploaded" - - try: - header_line = file.readline() - if isinstance(header_line, bytes): - header_line = header_line.decode() - file.seek(0) - f = io.StringIO(header_line.strip()) - return [h.strip() for h in csv.DictReader(f, delimiter=",").fieldnames] - except Exception: - if not msg: - msg = ( - "A header could not be parsed from your {} file. Make sure" - "Columns are comma delimited. Column names with commas must be" - "escaped by enclosing them in double quotes.".format(label) - ) - raise ValueError(msg) - - -def validate_has_hgvs_in_header(header, label=None, msg=None): - """ - Parameters - __________ - header : - label : - default = None - msg : - default = None - - Raises - ______ - ValueError - If - """ - if label is None: - label = "Uploaded" - params = {} - if msg is None: - msg = ( - "Your %(label)s file must define either a nucleotide hgvs column " - "'%(col_nt)s' or a protein hgvs column '%(col_p)s'. " - "Columns are case-sensitive and must be comma delimited." - ) - params = { - "label": label, - "col_nt": constants.hgvs_nt_column, - "col_p": constants.hgvs_pro_column, - } - if not set(header) & set(constants.hgvs_columns): - raise ValueError(msg) - - -def validate_at_least_one_additional_column(header, label=None, msg=None): - # TODO - # verify parameter types - """ - This function checks the passed header to see if there exists additional columns besides the three - specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. 
- - Parameters - __________ - header : - label : - default = None - msg : - default = None - - Raises - ______ - ValueError - If there are not additional columns in the header argument. - """ - if label is None: - label = "Uploaded" - params = {} - if not any(v not in constants.hgvs_columns for v in header): - if msg is None: - msg = ( - "Your %(label)s file must define at " - "least one additional column different " - "from '{}', '{}' and '{}'.".format( - constants.hgvs_nt_column, - constants.hgvs_splice_column, - constants.hgvs_pro_column, - ) - ) - params = {"label": label} - raise ValueError(msg) - - -def validate_header_contains_no_null_columns(header, label=None, msg=None): - """ - This function checks that the header parameter does not contain any null columns that - are not in the case-insensitive null values listed in constants.readable_null_values. - - Parameters - __________ - header : - label : - (default = None) - msg : - (default = None) - - Raises - ______ - ValueError - If the file header contains blank/empty/whitespace. Only columns or the - case-insensitive null values listed in constants.readable_null_values - are permitted. - """ - if label is None: - label = "File" - any_null = any([is_null(v) for v in header]) - if any_null: - if msg is None: - msg = ( - "%(label)s file header cannot contain blank/empty/whitespace " - "only columns or the following case-insensitive null " - "values: {}.".format( - label, ", ".join(constants.readable_null_values_list) - ) - ) - raise ValueError(msg) - - -def validate_datasets_define_same_variants(scores, counts): - """ - Checks if two `pd.DataFrame` objects parsed from uploaded files - define the same variants. - - Parameters - ---------- - scores : `pd.DataFrame` - Scores dataframe parsed from an uploaded scores file. - counts : `pd.DataFrame` - Scores dataframe parsed from an uploaded counts file. - - Raises - ______ - ValueError - If score and counts files do not define the same variants. 
- """ - try: - assert_array_equal( - scores[constants.hgvs_nt_column].sort_values().values, - counts[constants.hgvs_nt_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_splice_column].sort_values().values, - counts[constants.hgvs_splice_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_pro_column].sort_values().values, - counts[constants.hgvs_pro_column].sort_values().values, - ) - except AssertionError: - raise ValueError( - "Your score and counts files do not define the same variants. " - "Check that the hgvs columns in both files match." - ) - - -def validate_scoreset_score_data_input(file): - """ - Validator function for checking that the scores file input contains - at least the column 'hgvs' and 'score'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - An open file handle in read mode. - - Raises - ______ - ValueError - If score data file is missing the required column constants.required_score_column - """ - file.seek(0) - header = read_header_from_io(file, label="Score") - validate_header_contains_no_null_columns(header, label="Score") - validate_has_hgvs_in_header(header, label="Score") - validate_at_least_one_additional_column(header, label="Score") - - if constants.required_score_column not in header: - raise ValueError( - "Score data file is missing the required column " - + constants.required_score_column - + "." - + "Columns are case-sensitive and must be comma delimited." - ) - - -def validate_scoreset_count_data_input(file): - """ - Validator function for checking that the counts file input contains - at least the column 'hgvs'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - File parsed by a `django` form. 
- """ - file.seek(0) - header = read_header_from_io(file, label="Count") - validate_header_contains_no_null_columns(header, label="Count") - validate_has_hgvs_in_header(header, label="Count") - validate_at_least_one_additional_column(header, label="Count") - - -def validate_scoreset_json(dict_): - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `dataset_columns` attribute in a :class:`ScoreSet` instance. - - Parameters - ---------- - dict_ : dict - Dictionary of keys mapping to a list. - - Raises - ______ - ValueError - If scoreset data is missing the required key. - ValueError - If header values are not strings. - ValueError - If - ValueError - If missing required column constants.required_score_column for score dataset. - ValueError - If encountered unexpected keys extras. - """ - required_columns = [constants.score_columns, constants.count_columns] - - for key in required_columns: - if key not in dict_.keys(): - raise ValueError("Scoreset data is missing the required key " + key) - - columns = dict_[key] - if not all([isinstance(c, str) for c in columns]): - raise ValueError("Header values must be strings.") - - if not isinstance(columns, list): - type_ = type(columns).__name__ - raise ValueError( - "Value for " + key.replace("_", " ") + " must be a list not " + type_ - ) - - # Check score columns is not-empty and at least contains hgvs and score - if key == constants.score_columns: - if constants.required_score_column not in columns: - raise ValueError( - "Missing required column constants.required_score_column " - "for score dataset." - ) - - # Check there are not unexpected columns supplied to the scoreset json - # field. 
- extras = [k for k in dict_.keys() if k not in set(required_columns)] - if len(extras) > 0: - extras = [k for k in dict_.keys() if k not in required_columns] - raise ValueError("Encountered unexpected keys extras") diff --git a/mavecore/original_validation/exceptions.py b/mavecore/original_validation/exceptions.py deleted file mode 100644 index 2851fa7..0000000 --- a/mavecore/original_validation/exceptions.py +++ /dev/null @@ -1,2 +0,0 @@ -class ValidationError(ValueError): - pass diff --git a/mavecore/original_validation/genome_validators.py b/mavecore/original_validation/genome_validators.py deleted file mode 100644 index dff8b69..0000000 --- a/mavecore/original_validation/genome_validators.py +++ /dev/null @@ -1,605 +0,0 @@ -""" -Validator functions for the fields of the following classes: - WildTypeSequence - ReferenceGenome - TargetGene - ReferenceMap - GenomicInterval - -Most validation should validate one specific field, unless fields need -to be validated against each other. -""" -from fqfa.validator.validator import dna_bases_validator, amino_acids_validator -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation import constants - - -def is_null(value): - """ - This function checks if the value exists or is null. - - Parameters - __________ - value : - The value to be checked. - - Returns - _______ - bool - True if a stripped/lowercase value in `nan_col_values`. - """ - value = str(value).strip().lower() - return constants.null_values_re.fullmatch(value) or not value - - -# min_start_validator = MinValueValidator( -# 1, message=_("Start coordinate must be a positive integer.") -# ) -# min_end_validator = MinValueValidator( -# 1, message=_("End coordinate must be a positive integer.") -# ) - - -class WildTypeSequence: - """ - Basic model specifying a wild-type sequence. - - Parameters - ---------- - sequence : `models.CharField` - The wild type DNA sequence that is related to the `target`. 
Will - be converted to upper-case upon instantiation. - - sequence_type : `models.CharField` - Protein sequence (amino acids) or DNA (nucleotides) - """ - - class SequenceType: - """ """ - - DNA = "dna" - PROTEIN = "protein" - INFER = "infer" - - @classmethod - def detect_sequence_type(cls, sequence): - # TODO - # confirm sequence parameter type - """ - This function determines if the sequence is a DNA or protein sequence and - returns "dna" if it is DNA or "protein" if it is protein. An error is raised - if it is neither. - - Parameters - __________ - sequence : str - - Returns - _______ - str - "dna" or "protein" depending on if the sequence is a DNA or protein sequence. - - Raises - ______ - ValueError - If sequence parameter is not protein or DNA. - """ - if sequence_is_dna(sequence): - return cls.DNA - elif sequence_is_protein(sequence): - return cls.PROTEIN - else: - raise ValueError( - f"Unknown sequence '{sequence}'. It is not protein or DNA." - ) - - @classmethod - def is_protein(cls, value): - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.PROTEIN - - @classmethod - def is_dna(cls, value): - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.DNA - - @classmethod - def choices(cls): - """ - - Returns - _______ - """ - return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] - - class Meta: - """ """ - - verbose_name = "Reference sequence" - verbose_name_plural = "Reference sequences" - - def __str__(self): - """ - - Returns - _______ - - """ - return self.get_sequence() - - # sequence = models.TextField( - # default=None, - # blank=False, - # null=False, - # verbose_name="Reference sequence", - # validation=[validate_wildtype_sequence], - # ) - # sequence_type = models.CharField( - # blank=True, - # null=False, - # default=SequenceType.INFER, - # verbose_name="Reference sequence type", - # max_length=32, - # choices=SequenceType.choices(), - # ) - - 
@property - def is_dna(self): - """ - - Returns - _______ - - """ - return self.__class__.SequenceType.is_dna(self.sequence_type) - - @property - def is_protein(self): - """ - - Returns - _______ - - """ - return self.__class__.SequenceType.is_protein(self.sequence_type) - - def save(self, *args, **kwargs): - """ - - Parameters - __________ - args : - kwargs : - - Returns - _______ - - """ - if self.sequence is not None: - self.sequence = self.sequence.upper() - self.sequence_type = ( - (self.__class__.SequenceType.detect_sequence_type(self.sequence)) - if self.__class__.SequenceType.INFER - else self.sequence_type - ) - - return super().save(*args, **kwargs) - - def get_sequence(self): - """ - - Returns - _______ - - """ - return self.sequence.upper() - - def is_attached(self): - """ - - Returns - _______ - - """ - return getattr(self, "target", None) is not None - - -# GenomicInterval -# ------------------------------------------------------------------------- # -def validate_interval_start_lteq_end(start, end): - """ - This function validates whether or not an interval's starting coordinate is less than - or equal to that interval's ending coordinate. - - Parameters - __________ - start : int - The interval's starting coordinate. - end : int - The interval's ending coordinate. - - Returns - _______ - None - If start is NoneType or end is NoneType. - - Raises - ______ - ValidationError - If an interval's starting coordinate is greater than the ending coordinate. - """ - # Intervals may be underspecified, but will be ignored so skip validation. - if start is None or end is None: - return - if start > end: - raise ValidationError( - ( - "An interval's starting coordinate cannot be greater than the " - "ending coordinate." - ) - ) - - -def validate_strand(value): - # TODO - # find the type of value - """ - This function validates a GenomicInterval strand and raises an error if the strand is invalid. 
- - Parameters - __________ - value : - The Genomic Interval strand to be validated. - - Raises - ______ - ValidationError - If GenomicInterval strand is not positive or negative. - """ - if value not in ("+", "-"): - raise ValidationError("GenomicInterval strand must be either '+' or '-'") - - -def validate_chromosome(value): - # TODO - # add description and type for value parameter - """ - - Parameters - __________ - value : - - Returns - _______ - None - If value is NoneType. - - Raises - ______ - ValidationError - If chromosome identifier is null. - """ - # Intervals may be underspecified, but will be ignored so skip validation. - if value is None: - return - if is_null(value): - raise ValidationError("Chromosome identifier must not be null.") - - -def validate_unique_intervals(intervals): - # TODO - # add description and interval parameter type plus description - """ - - Parameters - __________ - intervals : - - Raises - ______ - ValidationError - If the same interval was specified twice. - """ - for interval1 in intervals: - for interval2 in intervals: - if ( - (interval1.pk is not None) - and (interval2.pk is not None) - and (interval1.pk == interval2.pk) - ): - continue - elif interval1 is interval2: - continue - elif interval1.equals(interval2): - raise ValidationError("You can not specify the same interval twice.") - - -# WildTypeSequence -# ------------------------------------------------------------------------- # -def validate_wildtype_sequence(seq, as_type="any"): - # TODO - # add description to as_type parameter - """ - This function checks whether or not seq is a wildtype sequence. - - Parameters - __________ - seq : str - The sequence being validated. - as_type : str - (default = "any") - - Raises - ______ - ValidationError - If seq is not a valid wild type sequence. - ValidationError - If seq is not a valid DNA or protein reference sequence. 
- """ - # from .models import WildTypeSequence - - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - raise ValidationError( - "'%(seq)s' is not a valid wild type sequence." # , params={"seq": seq} - ) - - seq = seq.upper() - is_dna = dna_bases_validator(seq) is not None - is_aa = amino_acids_validator(seq) is not None - - if as_type == WildTypeSequence.SequenceType.DNA and not is_dna: - raise ValidationError( - "'%(seq)s' is not a valid DNA reference sequence." # , - # params={"seq": seq}, - ) - elif as_type == WildTypeSequence.SequenceType.PROTEIN and not is_aa: - raise ValidationError( - "'%(seq)s' is not a valid protein reference sequence." # , - # params={"seq": seq}, - ) - elif (as_type == "any" or WildTypeSequence.SequenceType.INFER) and not ( - is_dna or is_aa - ): - raise ValidationError( - "'%(seq)s' is not a valid DNA or protein reference sequence." # , - # params={"seq": seq}, - ) - - -def sequence_is_dna(seq): - """ - This function checks if seq is a DNA sequence. - - Parameters - __________ - seq : str - The sequence to be validated. - - Returns - _______ - bool - True if the dna_bases_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - return dna_bases_validator(seq) is not None - - -def sequence_is_protein(seq): - """ - This function check if seq is a protein sequence. - - Parameters - __________ - seq : str - The sequence being validated. - - Returns - _______ - bool - True if seq is not null, is a DNA sequence or amino_acids_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. 
- if is_null(seq): - return False - seq = seq.upper() - if dna_bases_validator(seq) is not None: - return False # Very likely a DNA sequence if only ATG - return amino_acids_validator(seq) is not None - - -# ReferenceGenome -# ------------------------------------------------------------------------- # -def validate_organism_name(organism_name): - # TODO - # confirm organism_name type - """ - This function validates the organism name by checking that the name is not null. - - Parameters - __________ - organism_name : str - The organism name to be validated. - - Raises - ______ - ValidationError - If the organism name is null. - """ - if is_null(organism_name): - raise ValidationError("Species name must not be null.") - - -def validate_reference_genome_has_one_external_identifier(referencegenome): - # TODO - # revise description, make sure it is accurate - # anything greater than 0 will return True, so should it be == 1 or > 0? - # determine what type referencegenome is - """ - This function validates whether or not the reference genome has one external identifier. - An error is raised if - - Parameters - __________ - referencegenome : - - Raises - ______ - ValidationError - If - """ - if not referencegenome.genome_id: - raise ValidationError( - "Only one external identifier can be specified for a reference" "genome." - ) - - -def validate_genome_short_name(value): - # TODO - # confirm the type of the value parameter - """ - This function validates the genome short name and raises an error if the value is null. - - Parameters - __________ - value : str - The genome short name to be validated. - - Raises - ______ - ValidationError - If the genome short name is null. 
- """ - if is_null(value): - raise ValidationError("Genome short name must not be null.") - - -# ReferenceMap -# ------------------------------------------------------------------------- # -def validate_map_has_unique_reference_genome(annotations): - # TODO - # check the type of annotations - # add description to annotations parameter - """ - This function validates whether or not each map in annotations has a - unique reference genome and raises an error if this is not the case. - - Parameters - __________ - annotations : - - Raises - ______ - ValidationError - If each reference map does not specify a different reference genome. - """ - genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) - if len(genomes) < len(annotations): - raise ValidationError( - "Each reference map must specify a different reference genome." - ) - - -def validate_map_has_at_least_one_interval(reference_map): - """ - This function validates that a reference map has at least one interval and raises an error - if this is not the case. - - Parameters - __________ - reference_map : - Reference map. - - Raises - ______ - ValidationError - If the reference_map does not have at least one interval. - """ - if not reference_map.get_intervals().count(): - raise ValidationError( - "You must specify at least one interval for each reference map." - ) - - -def validate_at_least_one_map(reference_maps): - """ - This function validates whether a target has at least one reference map specified - and raises an error if it does not. - - Parameters - __________ - reference_maps : - - - Raises - ______ - ValidationError - If the target does not have at least one reference map specified. - """ - if not len(reference_maps): - raise ValidationError( - "A target must have at least one reference map specified." - ) - - -def validate_one_primary_map(reference_maps): - """ - This function validates the existence of one primary reference map and raises an error - if it does not exist. 
- - Parameters - __________ - reference_maps : - - Raises - ______ - ValidationError - If target has less than or more than one primary reference map. - """ - primary_count = sum(a.is_primary_reference_map() for a in reference_maps) - if primary_count > 1 or primary_count < 1: - raise ValidationError("A target must have one primary reference map.") - - -# TargetGene -# ------------------------------------------------------------------------- # -def validate_gene_name(gene_name): - # TODO - # confirm gene_name type - """ - This function checks to see if a gene name is null and raises and error if it is. - - Parameters - __________ - gene_name : str - The gene name. - - Raises - ______ - ValidationError - If gene name (value parameter) is null. - """ - if is_null(gene_name): - raise ValidationError("Gene name must not be null.") diff --git a/mavecore/original_validation/metadata_validators.py b/mavecore/original_validation/metadata_validators.py deleted file mode 100644 index 3c9d5d1..0000000 --- a/mavecore/original_validation/metadata_validators.py +++ /dev/null @@ -1,203 +0,0 @@ -import idutils - -from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import null_values_re - - -def is_null(value): - # TODO - # check that parameter type is accurate - """ - This function checks that the passed value is null. - - Parameters - __________ - value : str - Value to be checked if null. - - Returns - _______ - bool - True if a stripped/lowercase value in in `nan_col_values`. 
- """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -def validate_sra_identifier(identifier): - if not ( - idutils.is_sra(identifier) - or idutils.is_bioproject(identifier) - or idutils.is_geo(identifier) - or idutils.is_arrayexpress_array(identifier) - or idutils.is_arrayexpress_experiment(identifier) - ): - raise ValidationError( - f"'{identifier} is not a valid SRA, GEO, ArrayExpress or BioProject " - "accession." - ) - - -def validate_keyword(kw): - """ - This function validates whether or not the kw parameter is valid by - checking that it is a string that is not null. If kw is null - or is not a string, an error is raised. - - Parameters - __________ - kw : str - The keyword to be validated. - - Raises - ______ - ValidationError - If the kw argument is not a valid string. - """ - if is_null(kw) or not isinstance(kw, str): - raise ValidationError( - f"'{kw}' not a valid keyword. Keywords must be valid strings." - ) - - -def validate_pubmed_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_pmid(identifier): - raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") - - -def validate_doi_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_doi(identifier): - raise ValidationError(f"'{identifier}' is not a valid DOI.") - - -def validate_ensembl_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_ensembl(identifier): - raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") - - -def validate_uniprot_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_uniprot(identifier): - raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") - - -def validate_refseq_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_refseq(identifier): - raise ValidationError(f"'{identifier}' is not a 
valid RefSeq accession.") - - -def validate_genome_identifier(identifier): - """ - - :param identifier: - :return: - """ - if not idutils.is_genome(identifier): - raise ValidationError( - f"'{identifier}' is not a valid GenBank or RefSeq genome assembly." - ) - - -def validate_keyword_list(values): - """ - This function takes a list of keyword values and validates that each one is valid. - A valid keyword is a non-null string. The validate_keyword function will raise an - ValidationError if any of the keywords are invalid. - - Parameters - __________ - values : list[str] - The list of values to be validated. - """ - for value in values: - if not is_null(value): - validate_keyword(value) - - -def validate_pubmed_list(values): - """ - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_pubmed_identifier(value) - - -def validate_sra_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_sra_identifier(value) - - -def validate_doi_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_doi_identifier(value) - - -def validate_ensembl_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_ensembl_identifier(value) - - -def validate_refseq_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_refseq_identifier(value) - - -def validate_uniprot_list(values): - """ - - :param values: - :return: - """ - for value in values: - if not is_null(value): - validate_uniprot_identifier(value) diff --git a/mavecore/original_validation/urn_validators.py b/mavecore/original_validation/urn_validators.py deleted file mode 100644 index f81b8fd..0000000 --- a/mavecore/original_validation/urn_validators.py +++ /dev/null @@ -1,153 +0,0 @@ -import re -from mavecore.validation.exceptions import ValidationError - 
-MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 -MAVEDB_TMP_URN_DIGITS = 16 -MAVEDB_URN_MAX_LENGTH = 64 -MAVEDB_URN_NAMESPACE = "mavedb" - - -# Temp URN patterns -# --------------------------------------------------------------------------- # -MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( - width=MAVEDB_TMP_URN_DIGITS -) -MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) - - -# Experimentset Pattern/Compiled RE -MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( - namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS -) -MAVEDB_EXPERIMENTSET_URN_RE = re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) - -# Experiment Pattern/Compiled RE -MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( - pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] -) -MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) - -# Scoreset Pattern/Compiled RE -MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( - pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] -) -MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) - -# Variant Pattern/Compiled RE -MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( - pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] -) -MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) - -# Any Pattern/Compiled RE -MAVEDB_ANY_URN_PATTERN = "|".join( - [ - r"({pattern})".format(pattern=p) - for p in ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_VARIANT_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, - ) - ] -) -MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) - - -def validate_mavedb_urn(urn): - """ - This function validates a MaveDB urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The MaveDB urn to be validated. - - Raises - ______ - ValidationError - If the MaveDB urn is not valid. 
- """ - if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError("%(urn)s is not a valid urn.", params={"urn": urn}) - - -def validate_mavedb_urn_experimentset(urn): - """ - This function validates a Experiment Set urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Experiment Set urn to be validated. - - Raises - ______ - ValidationError - If the Experiment Set urn is not valid. - """ - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid Experiment Set urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_experiment(urn): - """ - This function validates an Experiment urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Experiment urn to be validated. - - Raises - ______ - ValidationError - If the Experiemnt urn is not valid. - """ - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid Experiment urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_scoreset(urn): - """ - This function validates a Scoreset urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Scoreset urn to be validated - - Raises - ______ - ValidationError - If the Scoreset urn is not valid. - """ - if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid score set urn.", params={"urn": urn} - ) - - -def validate_mavedb_urn_variant(urn): - """ - This function validates a MaveDB Variant urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The MaveDB Variant urn to be validated. - - Raises - ______ - ValidationError - If the MaveDB Variant urn is not valid. 
- """ - if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "%(urn)s is not a valid Variant urn.", params={"urn": urn} - ) diff --git a/mavecore/original_validation/validate.py b/mavecore/original_validation/validate.py deleted file mode 100644 index b138c9a..0000000 --- a/mavecore/original_validation/validate.py +++ /dev/null @@ -1,69 +0,0 @@ -from mavecore.validation import dataset_validators - - -def validate_all(countfile=None, scorefile=None, scorejson=None): - """ - By calling other helper functions, this function runs all of the validation code. - - Parameters - __________ - countfile : - scorefile : - scorejson : - - """ - validate_dataset(countfile, scorefile, scorejson) - - -def validate_dataset(countfile=None, scorefile=None, scorejson=None): - """ - This function calls all of the validation functions within - mavetools/mavetools/validation/dataset_validation.py - - Parameters - __________ - countfile : - scorefile : - scorejson : - - Returns - ------- - - """ - - # how to incorporate word limit validator? 
- - if scorefile is not None: - # open scorefile - open(scorefile) - # this one returns header - scoreheader = dataset_validators.read_header_from_io(file=scorefile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=scoreheader) - dataset_validators.validate_at_least_one_additional_column(header=scoreheader) - dataset_validators.validate_header_contains_no_null_columns(header=scoreheader) - - dataset_validators.validate_scoreset_score_data_input(file=scorefile) - - if scorejson is not None: - # open scorejson - open(scorejson) - dataset_validators.validate_scoreset_json(dict_=scorejson) - - if countfile is not None: - # open countfile - open(countfile) - countheader = dataset_validators.read_header_from_io(file=countfile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=countheader) - dataset_validators.validate_at_least_one_additional_column(header=countheader) - dataset_validators.validate_header_contains_no_null_columns(header=countheader) - - dataset_validators.validate_scoreset_count_data_input(file=countfile) - - if scorefile is not None and countfile is not None: - dataset_validators.validate_datasets_define_same_variants( - scores=scorefile, counts=countfile - ) diff --git a/mavecore/original_validation/variant_validators/__init__.py b/mavecore/original_validation/variant_validators/__init__.py deleted file mode 100644 index 1f7aca1..0000000 --- a/mavecore/original_validation/variant_validators/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from .dataset import MaveDataset, MaveCountsDataset, MaveScoresDataset - -from .hgvs import ( - validate_nt_variant, - validate_pro_variant, - validate_splice_variant, - validate_hgvs_string, -) - -from .variant import validate_columns_match, validate_variant_json - -__all__ = [ - "dataset", - "variant", - "hgvs", - "validate_nt_variant", - "validate_splice_variant", - "validate_pro_variant", - 
"validate_hgvs_string", - "validate_columns_match", - "validate_variant_json", - "MaveCountsDataset", - "MaveScoresDataset", - "MaveDataset", -] diff --git a/mavecore/original_validation/variant_validators/dataset.py b/mavecore/original_validation/variant_validators/dataset.py deleted file mode 100644 index 9461a0e..0000000 --- a/mavecore/original_validation/variant_validators/dataset.py +++ /dev/null @@ -1,1012 +0,0 @@ -import re -from collections import defaultdict -from io import StringIO -from itertools import groupby -from operator import itemgetter -from typing import Union, Optional, Tuple, List, TextIO, BinaryIO, Set, Dict - -import pandas as pd -import numpy as np -from mavehgvs import MaveHgvsParseError, Variant -from fqfa.util.translate import translate_dna -from fqfa.util.infer import infer_sequence_type - -from mavecore.validation.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, - required_score_column, - null_values_list, - null_values_re, - readable_null_values_list, -) - - -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. 
- - Parameters - __________ - value : - - Returns - _______ - bool - - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -class MaveDataset: - """ """ - - class DatasetType: - """ """ - - SCORES = "scores" - COUNTS = "counts" - - class HGVSColumns: - """ """ - - NUCLEOTIDE: str = hgvs_nt_column - TRANSCRIPT: str = hgvs_splice_column - PROTEIN: str = hgvs_pro_column - - @classmethod - def options(cls) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] - - class AdditionalColumns: - """ """ - - @classmethod - def options(cls) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [] - - # ---------------------- Construction------------------------------------ # - @classmethod - def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveScoresDataset` - - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) - - @classmethod - def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveCountsDataset` - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) - - @classmethod - def _for_type( - cls, file: Union[str, TextIO, BinaryIO], dataset_type: str - ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - dataset_type : str - - Returns - _______ - Union[`MaveScoreDataset`, `MaveCountsDataset`] - - Raises - ______ - TypeError - If file parameter is not expected file path or buffer object. - ValueError - If dataset_type parameter is not a recognized dataset type. 
- """ - if isinstance(file, str): - handle = file - elif hasattr(file, "read"): - file_contents = file.read() - if hasattr(file_contents, "decode"): - file_contents = file_contents.decode("utf-8") - file_contents = file_contents.strip() - handle = StringIO(file_contents) - else: - raise TypeError( - f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" - ) - - extra_na_values = set( - list(null_values_list) - + [str(x).lower() for x in null_values_list] - + [str(x).upper() for x in null_values_list] - + [str(x).capitalize() for x in null_values_list] - ) - - df = pd.read_csv( - filepath_or_buffer=handle, - sep=",", - encoding="utf-8", - quotechar='"', - comment="#", - na_values=extra_na_values, - keep_default_na=True, - dtype={ - **{c: str for c in cls.HGVSColumns.options()}, - MaveScoresDataset.AdditionalColumns.SCORES: float, - }, - ).replace(null_values_re, np.NaN) - - if dataset_type == cls.DatasetType.SCORES: - return MaveScoresDataset(df) - elif dataset_type == cls.DatasetType.COUNTS: - return MaveCountsDataset(df) - else: - raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") - - # ---------------------- Public ----------------------------------------- # - @property - def label(self) -> str: - """ - - Returns - _______ - str - """ - return "dataset" - - @property - def is_valid(self) -> Optional[bool]: - """ - - Returns - _______ - Optional[bool] - """ - if self._errors is None: - return None - return len(self._errors) == 0 - - @property - def n_errors(self) -> Optional[int]: - """ - - Returns - _______ - Optional[int] - """ - if self._errors is None: - return None - return len(self._errors) - - @property - def errors(self) -> Optional[List[str]]: - """ - - Returns - _______ - Optional[List[str]] - """ - return self._errors - - @property - def is_empty(self) -> bool: - """ - - Returns - _______ - bool - """ - return self._df.empty - - @property - def columns(self) -> List[str]: - """ - - Returns - _______ - List[str] - 
""" - return list(self._df.columns) - - @property - def hgvs_columns(self) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c in self.HGVSColumns.options()] - - @property - def non_hgvs_columns(self) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c not in self.HGVSColumns.options()] - - @property - def n_rows(self) -> int: - """ - - Returns - _______ - int - """ - return len(self._df) - - @property - def n_columns(self) -> int: - """ - - Returns - _______ - int - """ - return len(self.columns) - - @property - def index_column(self) -> Optional[str]: - """ - - Returns - _______ - Optional[str] - """ - if self._errors: - return None - return self._index_column - - @property - def index(self) -> Optional[pd.Index]: - """ - - Returns - _______ - Optional[`pd.Index`] - """ - if self._errors: - return None - return self._df.index.copy(deep=True) - - def data(self, serializable=False) -> pd.DataFrame: - """ - Return underlying dataframe object. - - Parameters - ---------- - serializable: bool - Replaces `np.NaN` with `None` for JSON compatibility. - - Returns - _______ - `pd.DataFrame` - - """ - if serializable: - # need to force "object" type to allow None values - return_df = self._df.astype(object, copy=True) - return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) - return return_df - return self._df.copy(deep=True) - - def match_other(self, other: "MaveDataset") -> Optional[bool]: - """ - Check that each dataset defined the same variants in each column. - - Parameters - ---------- - other: MaveDataset - Validator instance to match against. - - Returns - ------- - Optional[bool] - A boolean indicating index match, otherwise `None` if either instance - is not valid. 
- """ - if (not self.is_valid) or (not other.is_valid): - return None - - if self.index_column != other.index_column: - return False - - return all( - self._df[column].equals(other._df[column]) - for column in self.HGVSColumns.options() - ) - - def to_dict(self) -> Dict[str, Dict]: - """ - Returns underlying dataframe as dictionary in 'records' orientation. - Keys will be index values and values will be an inner dictionary mapping - column names to row values for said index. - - Returns - _______ - Dict[str, Dict] - """ - # Convert np.NaN values to None for consistency across all columns and - # for compatibility in PostgresSQL queries. Replaces all values which - # are considered null by pandas with None by masking pd.notnull cells. - - return self.data(serializable=True).to_dict(orient="index") - - def validate( - self, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - allow_index_duplicates: bool = False, - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - allow_index_duplicates : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - - self._errors = [] - self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) - self._index_column = None - - self._validate_columns() - # Only attempt to validate variants if columns are valid - if not self._errors: - ( - self._normalize_data() - ._validate_genomic_variants(targetseq, relaxed_ordering) - ._validate_transcript_variants(targetseq, relaxed_ordering) - ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column(allow_duplicates=allow_index_duplicates) - ) - - if self.is_empty: - self._errors.append( - f"No variants could be parsed from your {self.label} file. " - f"Please upload a non-empty file." 
- ) - return self - - if not self._errors: - # Set index last as original index is used when indicating duplicate - # hgvs string row numbers in the column name used as the index ( - # either hgvs_nt when present or hgvs_pro when hgvs_nt is absent). - self._df.index = pd.Index(self._df[self.index_column]) - - return self - - # ---------------------- Private ---------------------------------------- # - def __init__( - self, - df: Optional[pd.DataFrame] = None, - index_column: Optional[str] = None, - errors: Optional[List[str]] = None, - ): - """ - - Parameters - df : - index_column : - errors : - - Raises - ______ - - """ - self._df: pd.DataFrame = pd.DataFrame() if df is None else df - self._index_column = index_column or None - self._errors = None if errors is None else list(errors) - - def __repr__(self): - """ - - Returns - _______ - - """ - return ( - f"<" - f"{self.__class__.__name__} " - f"columns={self.columns} " - f"index={self.index_column} " - f"valid={self.is_valid}" - f">" - ) - - @property - def _column_order(self) -> Dict[str, int]: - """ - - Returns - _______ - Dict[str, int] - """ - return defaultdict( - lambda: 100, - { - self.HGVSColumns.NUCLEOTIDE: 0, - self.HGVSColumns.TRANSCRIPT: 1, - self.HGVSColumns.PROTEIN: 2, - **{ - c: (2 + i) - for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) - }, - }, - ) - - def _validate_columns(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - # Pandas will automatically name blank columns using the pattern below - unnamed = re.compile(r"^Unnamed: \d+$", flags=re.IGNORECASE) - columns = self.columns - if any(is_null(h) or unnamed.match(h) for h in columns): - self._errors.append( - f"Column names in your {self.label} file cannot values " - f"considered null such as the following: " - f"{', '.join(readable_null_values_list)}" - ) - - columns = [c for c in columns if not is_null(c)] - if len(columns) < 1: - 
self._errors.append( - f"No columns could not be parsed from your {self.label} file. " - "Make sure columns are comma delimited. Column names with " - "commas must be escaped by enclosing them in double quotes" - ) - - required = {self.HGVSColumns.NUCLEOTIDE, self.HGVSColumns.PROTEIN} - if not (set(columns) & required): - self._errors.append( - f"Your {self.label} file must define either a nucleotide " - f"hgvs column '({self.HGVSColumns.NUCLEOTIDE})' " - f"or a protein hgvs column '({self.HGVSColumns.PROTEIN})'. " - f"Columns are case-sensitive and must be comma delimited" - ) - - if not (set(columns) - set(self.HGVSColumns.options())): - self._errors.append( - f"Your {self.label} file must define at least one additional " - f"column different from '{self.HGVSColumns.NUCLEOTIDE}', " - f"'{self.HGVSColumns.TRANSCRIPT}' and " - f"'{self.HGVSColumns.PROTEIN}'" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - """ - if self._errors: - return self - - # Initialize missing hgvs columns as empty. 
- for c in self.HGVSColumns.options(): - if c not in self.columns: - self._df[c] = np.NaN - - column_order = self._column_order - sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) - - self._df = self._df[sorted_columns] - return self - - def _validate_genomic_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): - return self - - defines_transcript_variants = not self._column_is_null( - self.HGVSColumns.TRANSCRIPT - ) - validated_variants, prefixes, errors = self._validate_variants( - column=self.HGVSColumns.NUCLEOTIDE, - splice_defined=defines_transcript_variants, - targetseq=targetseq, - relaxed_ordering=relaxed_ordering, - ) - - if ("c" in prefixes or "n" in prefixes) and "g" in prefixes: - self._errors.append( - f"{self.HGVSColumns.NUCLEOTIDE}: Genomic variants " - f"(prefix 'g.') cannot be mixed with transcript variants " - f"(prefix 'c.' or 'n.')" - ) - - if prefixes == {"g"} and not defines_transcript_variants: - self._errors.append( - f"Transcript variants ('{self.HGVSColumns.TRANSCRIPT}' column) " - f"are required when specifying genomic variants " - f"(prefix 'g.' 
in the 'hgvs_nt' column)" - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.NUCLEOTIDE] = validated_variants - - self._index_column = self.HGVSColumns.NUCLEOTIDE - return self - - def _validate_transcript_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_tx and (not defines_nt): - self._errors.append( - f"Genomic variants ('{self.HGVSColumns.NUCLEOTIDE}' column) " - f"must be defined when specifying transcript " - f"variants ('{self.HGVSColumns.TRANSCRIPT}' column)" - ) - - if not defines_tx: - return self - - # Don't validate transcript variants against sequence. Might come - # back to this later with research into implementing gene models. 
- validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.TRANSCRIPT, - targetseq=None, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.TRANSCRIPT] = validated_variants - - return self - - def _validate_protein_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.PROTEIN): - return self - - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_splice = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_splice: - protein_seq = None - else: - protein_seq = targetseq - if targetseq and "dna" in infer_sequence_type(targetseq).lower(): - protein_seq, remainder = translate_dna(targetseq) - if remainder: - self._errors.insert( - 0, - "Protein variants could not be validated because the " - "length of your target sequence is not a multiple of 3", - ) - - validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.PROTEIN, - targetseq=protein_seq, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.PROTEIN] = validated_variants - - if not defines_nt: - self._index_column = self.HGVSColumns.PROTEIN - - return self - - def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": - """ - - Parameters - __________ - allow_duplicates : bool - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - if self._index_column is None: - self._index_column = self.HGVSColumns.NUCLEOTIDE - - if self._column_is_partially_null(self._index_column): - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"cannot contain any null values from " - 
f"{', '.join(readable_null_values_list)} (case-insensitive)" - ) - - if not allow_duplicates: - dupes = self._df[self._index_column].duplicated(keep=False) - if np.any(dupes): - dup_list = zip( - self._df.loc[dupes, self._index_column], dupes.index[dupes] - ) - dupes_str = ", ".join( - f"{v}: {[(g[1] + 1) for g in groups]}" # get row numbers - for (v, groups) in groupby(dup_list, key=itemgetter(0)) - ) - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"contains duplicate HGVS variants: {dupes_str}" - ) - - return self - - def _validate_variants( - self, - column: str, - splice_defined: Optional[bool] = None, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - ) -> Tuple[pd.Series, Set[str], List[str]]: - """ - - Parameters - __________ - column : str - splice_defined : Optional[bool] - targetseq : Optional[str] - relaxed_ordering : bool - - Returns - _______ - Tuple[`pd.Series`, Set[str], List[str]] - - Raises - ______ - - """ - - prefixes = set() - errors = [] - - def validate_variant(variant: str): - # TODO: logic mirrors that in validate_hgvs_string, which is kept - # as a standalone function for backwards compatibility with - # django's model validator field. Merge at some point. 
- - if is_null(variant): - return np.NaN - else: - try: - if variant.lower() == "_sy": - errors.append( - "'_sy' is no longer supported and should be " - "replaced by 'p.(=)'" - ) - return variant - elif variant.lower() == "_wt": - errors.append( - "'_wt' is no longer supported and should be " - "replaced by one of 'g.=', 'c.=' or 'n.='" - ) - return variant - - validated = Variant( - variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - prefix = validated.prefix.lower() - prefixes.add(prefix) - - prefix_error = self._validate_variant_prefix_for_column( - variant=validated, - prefix=validated.prefix, - column=column, - splice_defined=splice_defined, - ) - if prefix_error: - errors.append(prefix_error) - - return str(validated) - - except MaveHgvsParseError as error: - errors.append(f"{variant}: {str(error)}") - return np.NaN - - validated_variants = self._df[column].apply(validate_variant) - - return validated_variants, prefixes, errors - - def _column_is_null(self, column) -> bool: - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == len(self._df) - - def _column_is_partially_null(self, column) -> bool: - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return 0 < len(self._df[self._df[column].isna()]) < len(self._df) - - def _column_is_fully_specified(self, column) -> bool: - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == 0 - - def _validate_variant_prefix_for_column( - self, variant: Variant, prefix: str, column: str, splice_defined: bool - ) -> Optional[str]: - """ - - Parameters - __________ - variant : Variant - prefix : str - column : str - splice_defined : bool - - Returns - _______ - Optional[str] - - Raises - ______ - ValueError - If there is an unknown column as column argument. 
- """ - prefix = prefix.lower() - - if column == self.HGVSColumns.NUCLEOTIDE: - if splice_defined: - if prefix not in "g": - return ( - f"{column}: " - f"'{variant}' is not a genomic variant " - f"(prefix 'g.'). Nucleotide variants must " - f"be genomic if transcript variants are " - f"also present" - ) - else: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. " - f"The accepted transcript variant prefixes " - f"are 'c.' or 'n.'" - ) - elif column == self.HGVSColumns.TRANSCRIPT: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. The " - f"accepted transcript variant prefixes are " - f"'c.' or 'n.'" - ) - elif column == self.HGVSColumns.PROTEIN: - if prefix not in "p": - return ( - f"{column}: " - f"'{variant}' is not a protein variant. " - f"The accepted protein variant prefix is 'p.'" - ) - else: - raise ValueError( - f"Unknown column '{column}'. Expected one " - f"of {', '.join(self.HGVSColumns.options())}" - ) - - return None - - -class MaveScoresDataset(MaveDataset): - """ """ - - class AdditionalColumns: - """ """ - - SCORES = required_score_column - - @classmethod - def options(cls) -> List[str]: - """ - - Returns - _______ - List[str] - """ - return [cls.SCORES] - - @property - def label(self) -> str: - """ - - Returns - _______ - str - """ - return "scores" - - def _validate_columns(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - super()._validate_columns() - - if self.AdditionalColumns.SCORES not in self.columns: - self._errors.append( - f"Your scores dataset is missing the " - f"'{self.AdditionalColumns.SCORES}' column. 
" - f"Columns are case-sensitive and must be comma delimited" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - ValueError - - """ - super()._normalize_data() - - should_be_numeric = [self.AdditionalColumns.SCORES] - for c in should_be_numeric: - if c in self.columns: - try: - self._df[c] = self._df[c].astype(dtype=float, errors="raise") - except ValueError as e: - self._errors.append(f"{c}: {str(e)}") - - return self - - -class MaveCountsDataset(MaveDataset): - """ """ - - @property - def label(self) -> str: - """ - - Returns - _______ - str - """ - return "counts" diff --git a/mavecore/original_validation/variant_validators/hgvs.py b/mavecore/original_validation/variant_validators/hgvs.py deleted file mode 100644 index 3f0c043..0000000 --- a/mavecore/original_validation/variant_validators/hgvs.py +++ /dev/null @@ -1,134 +0,0 @@ -from functools import partial -from typing import Optional, Union - -from mavehgvs import Variant, MaveHgvsParseError -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation.constants import NA_STRING, null_values_re - -from mavecore.validation.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, -) - - -# from core.utilities import is_null -def is_null(value): - """ - Returns True if a stripped/lowercase value in in `nan_col_values`. 
- - Parameters - __________ - value - - Returns - _______ - - """ - value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value - - -def validate_hgvs_string( - value: Union[str, bytes], - column: Optional[str] = None, - splice_present: bool = False, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, -) -> Optional[str]: - """ - - Parameters - __________ - value : Union[str, bytes] - column : Optional[str] = None - splice_present : - targetseq : - relaxed_ordering : - - Returns - _______ - - Raises - ______ - ValidationError - If variant HGVS input values are not strings. - ValidationError - If value is _sy or _wt, which are no longer supported. - ValidationError - If - ValidationError - If value is not a genomic variant (prefix 'g.'). Nucleotide variants must - be genomic if transcript variants are also defined. - ValidationError - If value is not a transcript variant. The accepted transcript variant - prefixes are 'c.', 'n.'. - ValidationError - If value is not a protein variant. The accepted protein variant prefix is 'p.'. - ValueError - If there exists an unknown column. Function expects nt, splice or p." - """ - if is_null(value): - return None - - if hasattr(value, "decode"): - value = value.decode() - if not isinstance(value, str): - raise ValidationError( - "Variant HGVS values input must be strings. 
" - "'{}' has the type '{}'.".format(value, type(value).__name__) - ) - - if value.lower() == "_sy": - raise ValidationError( - "_sy is no longer supported and should be replaced by p.(=)" - ) - elif value.lower() == "_wt": - raise ValidationError( - "_wt is no longer supported and should be replaced by (cgnp).=" - ) - - try: - variant = Variant( - s=value, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - except MaveHgvsParseError as error: - raise ValidationError(f"{value}: {str(error)}") - - prefix = variant.prefix.lower() - if column in ("nt", hgvs_nt_column): - if splice_present: - if prefix not in "g": - raise ValidationError( - f"'{value}' is not a genomic variant (prefix 'g.'). " - f"Nucleotide variants must be genomic if transcript " - f"variants are also defined." - ) - else: - if prefix not in "cn": - raise ValidationError( - f"'{value}' is not a transcript variant. The accepted " - f"transcript variant prefixes are 'c.', 'n.'." - ) - elif column in ("splice", hgvs_splice_column): - if prefix not in "cn": - raise ValidationError( - f"'{value}' is not a transcript variant. The accepted " - f"transcript variant prefixes are 'c.', 'n.'." - ) - elif column in ("p", hgvs_pro_column): - if prefix not in "p": - raise ValidationError( - f"'{value}' is not a protein variant. The accepted " - f"protein variant prefix is 'p.'." - ) - else: - raise ValueError("Unknown column '{}'. 
Expected nt, splice or p".format(column)) - - return str(variant) - - -validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) -validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) -validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) diff --git a/mavecore/original_validation/variant_validators/variant.py b/mavecore/original_validation/variant_validators/variant.py deleted file mode 100644 index bf00e71..0000000 --- a/mavecore/original_validation/variant_validators/variant.py +++ /dev/null @@ -1,85 +0,0 @@ -from typing import Dict - -from mavecore.validation.constants import ( - variant_score_data, - variant_count_data, - required_score_column, -) -from mavecore.validation.exceptions import ValidationError - - -def validate_columns_match(variant, scoreset) -> None: - # TODO - # document errors correctly, note key error - """ - Validate that a child matches parents defined columns to keep - data in sync. - - Parameters - __________ - variant : - scoreset : - - Raises - ______ - ValidationError - If variant score columns do not match scoreset score columns. - ValidationError - If variant count columns do not match scoreset count columns. - """ - try: - if variant.score_columns != scoreset.score_columns: - raise ValidationError( - f"Variant defines score columns '{variant.score_columns}' " - f"but parent defines columns '{scoreset.score_columns}. " - ) - if variant.count_columns != scoreset.count_columns: - raise ValidationError( - f"Variant defines count columns '{variant.count_columns}' " - f"but parent defines columns '{scoreset.count_columns}. " - ) - except KeyError as error: - raise ValidationError(f"Missing key {str(error)}") - - -def validate_variant_json(data: Dict[str, Dict]) -> None: - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `data` attribute in a :class:`Variant` instance. 
- - Parameters - ---------- - data : dict[str, dict] - Dictionary of keys mapping to a list. - - Raises - ______ - ValidationError - If missing the required key. - ValidationError - If missing the required column in variant's score data. - ValidationError - If encountered unexpected keys. - ValidationError - If value for key is not of type dict. - """ - expected_keys = [variant_score_data, variant_count_data] - for key in expected_keys: - if key not in data.keys(): - raise ValidationError(f"Missing the required key {key}") - - if required_score_column not in data[variant_score_data]: - raise ValidationError( - f"Missing required column '{required_score_column}' in variant's score data." - ) - - extras = [k for k in data.keys() if k not in set(expected_keys)] - if len(extras) > 0: - extras = [k for k in data.keys() if k not in expected_keys] - raise ValidationError("Encountered unexpected keys {extras}") - - # Check the correct data types are given. - for key in expected_keys: - if not isinstance(data[key], dict): - type_ = type(data[key]).__name__ - raise ValidationError(f"Value for '{key}' must be a dict not {type_}.") From 04eedd324749fe9d67abc0bef7cd5b61cca6357d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 20:20:44 -0700 Subject: [PATCH 355/877] edit TODO message --- mavecore/validation/dataset_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation/dataset_validators.py index 330772d..3ac8d7b 100644 --- a/mavecore/validation/dataset_validators.py +++ b/mavecore/validation/dataset_validators.py @@ -77,7 +77,7 @@ def __call__(self, value): def read_header_from_io(file, label=None, msg=None): # TODO - # confirm types for parameters + # confirm type for the file parameter """ This takes a file and reads the header from that file. 
From e0d78fdb44b3a7597414d5f1a5ac201be65f7579 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:12:36 -0700 Subject: [PATCH 356/877] add comments to keep track of Django independent code --- mavecore/validation/genome_validators.py | 26 +++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py index c8f0c40..ffa259e 100644 --- a/mavecore/validation/genome_validators.py +++ b/mavecore/validation/genome_validators.py @@ -26,7 +26,7 @@ # ) -class WildTypeSequence: +class WildTypeSequence:#g """ Basic model specifying a wild-type sequence. @@ -40,7 +40,7 @@ class WildTypeSequence: Protein sequence (amino acids) or DNA (nucleotides) """ - class SequenceType: + class SequenceType:#g """ """ DNA = "dna" @@ -48,7 +48,7 @@ class SequenceType: INFER = "infer" @classmethod - def detect_sequence_type(cls, sequence): + def detect_sequence_type(cls, sequence):#g # TODO # confirm sequence parameter type """ @@ -80,7 +80,7 @@ def detect_sequence_type(cls, sequence): ) @classmethod - def is_protein(cls, value): + def is_protein(cls, value):#g """ Parameters @@ -94,7 +94,7 @@ def is_protein(cls, value): return value == cls.PROTEIN @classmethod - def is_dna(cls, value): + def is_dna(cls, value): #g """ Parameters @@ -108,7 +108,7 @@ def is_dna(cls, value): return value == cls.DNA @classmethod - def choices(cls): + def choices(cls):#g """ Returns @@ -116,13 +116,13 @@ def choices(cls): """ return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] - class Meta: + class Meta:#g """ """ verbose_name = "Reference sequence" verbose_name_plural = "Reference sequences" - def __str__(self): + def __str__(self):#g """ Returns @@ -155,7 +155,7 @@ def is_dna(self): _______ """ - return self.__class__.SequenceType.is_dna(self.sequence_type) + return #self.__class__.SequenceType.is_dna(self.sequence_type) 
@property def is_protein(self): @@ -165,7 +165,7 @@ def is_protein(self): _______ """ - return self.__class__.SequenceType.is_protein(self.sequence_type) + return #self.__class__.SequenceType.is_protein(self.sequence_type) def save(self, *args, **kwargs): """ @@ -371,7 +371,7 @@ def validate_wildtype_sequence(seq, as_type="any"): ) -def sequence_is_dna(seq): +def sequence_is_dna(seq):#g """ This function checks if seq is a DNA sequence. @@ -392,7 +392,7 @@ def sequence_is_dna(seq): return dna_bases_validator(seq) is not None -def sequence_is_protein(seq): +def sequence_is_protein(seq):#g """ This function check if seq is a protein sequence. @@ -418,8 +418,6 @@ def sequence_is_protein(seq): # ReferenceGenome # ------------------------------------------------------------------------- # def validate_organism_name(organism_name): - # TODO - # confirm organism_name type """ This function validates the organism name by checking that the name is not null. From 089194211a010fe8b686e2b8f5200a2873be4b2b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:17:06 -0700 Subject: [PATCH 357/877] delete Django dataset validators --- .../validation/variant_validators/dataset.py | 1038 ----------------- 1 file changed, 1038 deletions(-) delete mode 100644 mavecore/validation/variant_validators/dataset.py diff --git a/mavecore/validation/variant_validators/dataset.py b/mavecore/validation/variant_validators/dataset.py deleted file mode 100644 index 2d5fa33..0000000 --- a/mavecore/validation/variant_validators/dataset.py +++ /dev/null @@ -1,1038 +0,0 @@ -# TODO Django dependent, whole file will need to be refactored -import re -from collections import defaultdict -from io import StringIO -from itertools import groupby -from operator import itemgetter -from typing import Union, Optional, Tuple, List, TextIO, BinaryIO, Set, Dict - -import pandas as pd -import numpy as np -from mavehgvs import MaveHgvsParseError, Variant -from 
fqfa.util.translate import translate_dna -from fqfa.util.infer import infer_sequence_type - -from mavecore.validation.constants import ( - hgvs_nt_column, - hgvs_splice_column, - hgvs_pro_column, - required_score_column, - null_values_list, - null_values_re, - readable_null_values_list, -) - -from mavecore.validation.utilities import is_null - - -class MaveDataset: - # TODO Django dependent - - class DatasetType: - # TODO - """ """ - SCORES = "scores" - COUNTS = "counts" - - class HGVSColumns: - # TODO - """ """ - NUCLEOTIDE: str = hgvs_nt_column - TRANSCRIPT: str = hgvs_splice_column - PROTEIN: str = hgvs_pro_column - - @classmethod - def options(cls) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [cls.NUCLEOTIDE, cls.TRANSCRIPT, cls.PROTEIN] - - class AdditionalColumns: - # TODO Django dependent - @classmethod - def options(cls) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [] - - # ---------------------- Construction------------------------------------ # - @classmethod - def for_scores(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveScoresDataset": - # TODO Django dependent - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveScoresDataset` - - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.SCORES) - - @classmethod - def for_counts(cls, file: Union[str, TextIO, BinaryIO]) -> "MaveCountsDataset": - # TODO Django dependent - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - - Returns - _______ - `MaveCountsDataset` - """ - return cls._for_type(file=file, dataset_type=cls.DatasetType.COUNTS) - - @classmethod - def _for_type( - cls, file: Union[str, TextIO, BinaryIO], dataset_type: str - ) -> Union["MaveScoresDataset", "MaveCountsDataset"]: - # TODO Django dependent - """ - - Parameters - __________ - file : Union[str, TextIO, BinaryIO] - dataset_type : str - - Returns - _______ - 
Union[`MaveScoreDataset`, `MaveCountsDataset`] - - Raises - ______ - TypeError - If file parameter is not expected file path or buffer object. - ValueError - If dataset_type parameter is not a recognized dataset type. - """ - if isinstance(file, str): - handle = file - elif hasattr(file, "read"): - file_contents = file.read() - if hasattr(file_contents, "decode"): - file_contents = file_contents.decode("utf-8") - file_contents = file_contents.strip() - handle = StringIO(file_contents) - else: - raise TypeError( - f"Expected file path or buffer object. " f"Got '{type(file).__name__}'" - ) - - extra_na_values = set( - list(null_values_list) - + [str(x).lower() for x in null_values_list] - + [str(x).upper() for x in null_values_list] - + [str(x).capitalize() for x in null_values_list] - ) - - df = pd.read_csv( - filepath_or_buffer=handle, - sep=",", - encoding="utf-8", - quotechar='"', - comment="#", - na_values=extra_na_values, - keep_default_na=True, - dtype={ - **{c: str for c in cls.HGVSColumns.options()}, - MaveScoresDataset.AdditionalColumns.SCORES: float, - }, - ).replace(null_values_re, np.NaN) - - if dataset_type == cls.DatasetType.SCORES: - return MaveScoresDataset(df) - elif dataset_type == cls.DatasetType.COUNTS: - return MaveCountsDataset(df) - else: - raise ValueError(f"'{dataset_type}' is not a recognised dataset type.") - - # ---------------------- Public ----------------------------------------- # - @property - def label(self) -> str: - # TODO Django dependent - """ - - Returns - _______ - str - """ - return "dataset" - - @property - def is_valid(self) -> Optional[bool]: - # TODO Django dependent - """ - - Returns - _______ - Optional[bool] - """ - if self._errors is None: - return None - return len(self._errors) == 0 - - @property - def n_errors(self) -> Optional[int]: - # TODO Django dependent - """ - - Returns - _______ - Optional[int] - """ - if self._errors is None: - return None - return len(self._errors) - - @property - def errors(self) -> 
Optional[List[str]]: - # TODO Django dependent - """ - - Returns - _______ - Optional[List[str]] - """ - return self._errors - - @property - def is_empty(self) -> bool: - # TODO Django dependent - """ - - Returns - _______ - bool - """ - return self._df.empty - - @property - def columns(self) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return list(self._df.columns) - - @property - def hgvs_columns(self) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c in self.HGVSColumns.options()] - - @property - def non_hgvs_columns(self) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [c for c in self.columns if c not in self.HGVSColumns.options()] - - @property - def n_rows(self) -> int: - # TODO Django dependent - """ - - Returns - _______ - int - """ - return len(self._df) - - @property - def n_columns(self) -> int: - # TODO Django dependent - """ - - Returns - _______ - int - """ - return len(self.columns) - - @property - def index_column(self) -> Optional[str]: - # TODO Django dependent - """ - - Returns - _______ - Optional[str] - """ - if self._errors: - return None - return self._index_column - - @property - def index(self) -> Optional[pd.Index]: - # TODO Django dependent - """ - - Returns - _______ - Optional[`pd.Index`] - """ - if self._errors: - return None - return self._df.index.copy(deep=True) - - def data(self, serializable=False) -> pd.DataFrame: - # TODO Django dependent - """ - Return underlying dataframe object. - - Parameters - ---------- - serializable: bool - Replaces `np.NaN` with `None` for JSON compatibility. 
- - Returns - _______ - `pd.DataFrame` - - """ - if serializable: - # need to force "object" type to allow None values - return_df = self._df.astype(object, copy=True) - return_df.where(cond=pd.notnull(return_df), other=None, inplace=True) - return return_df - return self._df.copy(deep=True) - - def match_other(self, other: "MaveDataset") -> Optional[bool]: - # TODO Django dependent - """ - Check that each dataset defined the same variants in each column. - - Parameters - ---------- - other: MaveDataset - Validator instance to match against. - - Returns - ------- - Optional[bool] - A boolean indicating index match, otherwise `None` if either instance - is not valid. - """ - if (not self.is_valid) or (not other.is_valid): - return None - - if self.index_column != other.index_column: - return False - - return all( - self._df[column].equals(other._df[column]) - for column in self.HGVSColumns.options() - ) - - def to_dict(self) -> Dict[str, Dict]: - # TODO Django dependent - """ - Returns underlying dataframe as dictionary in 'records' orientation. - Keys will be index values and values will be an inner dictionary mapping - column names to row values for said index. - - Returns - _______ - Dict[str, Dict] - """ - # Convert np.NaN values to None for consistency across all columns and - # for compatibility in PostgresSQL queries. Replaces all values which - # are considered null by pandas with None by masking pd.notnull cells. 
- - return self.data(serializable=True).to_dict(orient="index") - - def validate( - self, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - allow_index_duplicates: bool = False, - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - allow_index_duplicates : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - - self._errors = [] - self._df.index = pd.RangeIndex(start=0, stop=self.n_rows, step=1) - self._index_column = None - - self._validate_columns() - # Only attempt to validate variants if columns are valid - if not self._errors: - ( - self._normalize_data() - ._validate_genomic_variants(targetseq, relaxed_ordering) - ._validate_transcript_variants(targetseq, relaxed_ordering) - ._validate_protein_variants(targetseq, relaxed_ordering) - ._validate_index_column(allow_duplicates=allow_index_duplicates) - ) - - if self.is_empty: - self._errors.append( - f"No variants could be parsed from your {self.label} file. " - f"Please upload a non-empty file." - ) - return self - - if not self._errors: - # Set index last as original index is used when indicating duplicate - # hgvs string row numbers in the column name used as the index ( - # either hgvs_nt when present or hgvs_pro when hgvs_nt is absent). 
- self._df.index = pd.Index(self._df[self.index_column]) - - return self - - # ---------------------- Private ---------------------------------------- # - def __init__( - self, - df: Optional[pd.DataFrame] = None, - index_column: Optional[str] = None, - errors: Optional[List[str]] = None, - ): - # TODO Django dependent - """ - - Parameters - df : - index_column : - errors : - - Raises - ______ - - """ - self._df: pd.DataFrame = pd.DataFrame() if df is None else df - self._index_column = index_column or None - self._errors = None if errors is None else list(errors) - - def __repr__(self): - # TODO Django dependent - """ - - Returns - _______ - - """ - return ( - f"<" - f"{self.__class__.__name__} " - f"columns={self.columns} " - f"index={self.index_column} " - f"valid={self.is_valid}" - f">" - ) - - @property - def _column_order(self) -> Dict[str, int]: - # TODO Django dependent - """ - - Returns - _______ - Dict[str, int] - """ - return defaultdict( - lambda: 100, - { - self.HGVSColumns.NUCLEOTIDE: 0, - self.HGVSColumns.TRANSCRIPT: 1, - self.HGVSColumns.PROTEIN: 2, - **{ - c: (2 + i) - for (i, c) in enumerate(self.AdditionalColumns.options(), start=1) - }, - }, - ) - - def _validate_columns(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - # Pandas will automatically name blank columns using the pattern below - unnamed = re.compile(r"^Unnamed: \d+$", flags=re.IGNORECASE) - columns = self.columns - if any(is_null(h) or unnamed.match(h) for h in columns): - self._errors.append( - f"Column names in your {self.label} file cannot values " - f"considered null such as the following: " - f"{', '.join(readable_null_values_list)}" - ) - - columns = [c for c in columns if not is_null(c)] - if len(columns) < 1: - self._errors.append( - f"No columns could not be parsed from your {self.label} file. " - "Make sure columns are comma delimited. 
Column names with " - "commas must be escaped by enclosing them in double quotes" - ) - - required = {self.HGVSColumns.NUCLEOTIDE, self.HGVSColumns.PROTEIN} - if not (set(columns) & required): - self._errors.append( - f"Your {self.label} file must define either a nucleotide " - f"hgvs column '({self.HGVSColumns.NUCLEOTIDE})' " - f"or a protein hgvs column '({self.HGVSColumns.PROTEIN})'. " - f"Columns are case-sensitive and must be comma delimited" - ) - - if not (set(columns) - set(self.HGVSColumns.options())): - self._errors.append( - f"Your {self.label} file must define at least one additional " - f"column different from '{self.HGVSColumns.NUCLEOTIDE}', " - f"'{self.HGVSColumns.TRANSCRIPT}' and " - f"'{self.HGVSColumns.PROTEIN}'" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - """ - if self._errors: - return self - - # Initialize missing hgvs columns as empty. - for c in self.HGVSColumns.options(): - if c not in self.columns: - self._df[c] = np.NaN - - column_order = self._column_order - sorted_columns = list(sorted(self.columns, key=lambda x: column_order[x])) - - self._df = self._df[sorted_columns] - return self - - def _validate_genomic_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.NUCLEOTIDE): - return self - - defines_transcript_variants = not self._column_is_null( - self.HGVSColumns.TRANSCRIPT - ) - validated_variants, prefixes, errors = self._validate_variants( - column=self.HGVSColumns.NUCLEOTIDE, - splice_defined=defines_transcript_variants, - targetseq=targetseq, - relaxed_ordering=relaxed_ordering, - ) - - if ("c" in prefixes or "n" in prefixes) and "g" in prefixes: - self._errors.append( - 
f"{self.HGVSColumns.NUCLEOTIDE}: Genomic variants " - f"(prefix 'g.') cannot be mixed with transcript variants " - f"(prefix 'c.' or 'n.')" - ) - - if prefixes == {"g"} and not defines_transcript_variants: - self._errors.append( - f"Transcript variants ('{self.HGVSColumns.TRANSCRIPT}' column) " - f"are required when specifying genomic variants " - f"(prefix 'g.' in the 'hgvs_nt' column)" - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.NUCLEOTIDE] = validated_variants - - self._index_column = self.HGVSColumns.NUCLEOTIDE - return self - - def _validate_transcript_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_tx = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_tx and (not defines_nt): - self._errors.append( - f"Genomic variants ('{self.HGVSColumns.NUCLEOTIDE}' column) " - f"must be defined when specifying transcript " - f"variants ('{self.HGVSColumns.TRANSCRIPT}' column)" - ) - - if not defines_tx: - return self - - # Don't validate transcript variants against sequence. Might come - # back to this later with research into implementing gene models. 
- validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.TRANSCRIPT, - targetseq=None, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.TRANSCRIPT] = validated_variants - - return self - - def _validate_protein_variants( - self, targetseq: Optional[str] = None, relaxed_ordering: bool = False - ) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - targetseq : - relaxed_ordering : - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._column_is_null(self.HGVSColumns.PROTEIN): - return self - - defines_nt = not self._column_is_null(self.HGVSColumns.NUCLEOTIDE) - defines_splice = not self._column_is_null(self.HGVSColumns.TRANSCRIPT) - - if defines_splice: - protein_seq = None - else: - protein_seq = targetseq - if targetseq and "dna" in infer_sequence_type(targetseq).lower(): - protein_seq, remainder = translate_dna(targetseq) - if remainder: - self._errors.insert( - 0, - "Protein variants could not be validated because the " - "length of your target sequence is not a multiple of 3", - ) - - validated_variants, _, errors = self._validate_variants( - column=self.HGVSColumns.PROTEIN, - targetseq=protein_seq, - relaxed_ordering=relaxed_ordering, - ) - - self._errors += errors - - if not self._errors: - self._df[self.HGVSColumns.PROTEIN] = validated_variants - - if not defines_nt: - self._index_column = self.HGVSColumns.PROTEIN - - return self - - def _validate_index_column(self, allow_duplicates: bool = False) -> "MaveDataset": - # TODO Django dependent - """ - - Parameters - __________ - allow_duplicates : bool - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - if self._errors: - return self - - if self._index_column is None: - self._index_column = self.HGVSColumns.NUCLEOTIDE - - if self._column_is_partially_null(self._index_column): - self._errors.append( - f"Primary column (inferred as 
'{self._index_column}') " - f"cannot contain any null values from " - f"{', '.join(readable_null_values_list)} (case-insensitive)" - ) - - if not allow_duplicates: - dupes = self._df[self._index_column].duplicated(keep=False) - if np.any(dupes): - dup_list = zip( - self._df.loc[dupes, self._index_column], dupes.index[dupes] - ) - dupes_str = ", ".join( - f"{v}: {[(g[1] + 1) for g in groups]}" # get row numbers - for (v, groups) in groupby(dup_list, key=itemgetter(0)) - ) - self._errors.append( - f"Primary column (inferred as '{self._index_column}') " - f"contains duplicate HGVS variants: {dupes_str}" - ) - - return self - - def _validate_variants( - self, - column: str, - splice_defined: Optional[bool] = None, - targetseq: Optional[str] = None, - relaxed_ordering: bool = False, - ) -> Tuple[pd.Series, Set[str], List[str]]: - # TODO Django dependent - """ - - Parameters - __________ - column : str - splice_defined : Optional[bool] - targetseq : Optional[str] - relaxed_ordering : bool - - Returns - _______ - Tuple[`pd.Series`, Set[str], List[str]] - - Raises - ______ - - """ - - prefixes = set() - errors = [] - - def validate_variant(variant: str): - # TODO Django dependent - # TODO: logic mirrors that in validate_hgvs_string, which is kept - # as a standalone function for backwards compatibility with - # django's model validator field. Merge at some point. 
- - if is_null(variant): - return np.NaN - else: - try: - if variant.lower() == "_sy": - errors.append( - "'_sy' is no longer supported and should be " - "replaced by 'p.(=)'" - ) - return variant - elif variant.lower() == "_wt": - errors.append( - "'_wt' is no longer supported and should be " - "replaced by one of 'g.=', 'c.=' or 'n.='" - ) - return variant - - validated = Variant( - variant, targetseq=targetseq, relaxed_ordering=relaxed_ordering - ) - prefix = validated.prefix.lower() - prefixes.add(prefix) - - prefix_error = self._validate_variant_prefix_for_column( - variant=validated, - prefix=validated.prefix, - column=column, - splice_defined=splice_defined, - ) - if prefix_error: - errors.append(prefix_error) - - return str(validated) - - except MaveHgvsParseError as error: - errors.append(f"{variant}: {str(error)}") - return np.NaN - - validated_variants = self._df[column].apply(validate_variant) - - return validated_variants, prefixes, errors - - def _column_is_null(self, column) -> bool: - # TODO Django dependent - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == len(self._df) - - def _column_is_partially_null(self, column) -> bool: - # TODO Django dependent - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return 0 < len(self._df[self._df[column].isna()]) < len(self._df) - - def _column_is_fully_specified(self, column) -> bool: - # TODO Django dependent - """ - - Parameters - __________ - column : - - Returns - _______ - bool - """ - return len(self._df[self._df[column].isna()]) == 0 - - def _validate_variant_prefix_for_column( - self, variant: Variant, prefix: str, column: str, splice_defined: bool - ) -> Optional[str]: - # TODO Django dependent - """ - - Parameters - __________ - variant : Variant - prefix : str - column : str - splice_defined : bool - - Returns - _______ - Optional[str] - - Raises - ______ - ValueError - If there is an 
unknown column as column argument. - """ - prefix = prefix.lower() - - if column == self.HGVSColumns.NUCLEOTIDE: - if splice_defined: - if prefix not in "g": - return ( - f"{column}: " - f"'{variant}' is not a genomic variant " - f"(prefix 'g.'). Nucleotide variants must " - f"be genomic if transcript variants are " - f"also present" - ) - else: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. " - f"The accepted transcript variant prefixes " - f"are 'c.' or 'n.'" - ) - elif column == self.HGVSColumns.TRANSCRIPT: - if prefix not in "cn": - return ( - f"{column}: " - f"'{variant}' is not a transcript variant. The " - f"accepted transcript variant prefixes are " - f"'c.' or 'n.'" - ) - elif column == self.HGVSColumns.PROTEIN: - if prefix not in "p": - return ( - f"{column}: " - f"'{variant}' is not a protein variant. " - f"The accepted protein variant prefix is 'p.'" - ) - else: - raise ValueError( - f"Unknown column '{column}'. Expected one " - f"of {', '.join(self.HGVSColumns.options())}" - ) - - return None - - -class MaveScoresDataset(MaveDataset): - # TODO - """ """ - class AdditionalColumns: - # TODO - """ """ - SCORES = required_score_column - - @classmethod - def options(cls) -> List[str]: - # TODO Django dependent - """ - - Returns - _______ - List[str] - """ - return [cls.SCORES] - - @property - def label(self) -> str: - # TODO Django dependent - """ - - Returns - _______ - str - """ - return "scores" - - def _validate_columns(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - - """ - super()._validate_columns() - - if self.AdditionalColumns.SCORES not in self.columns: - self._errors.append( - f"Your scores dataset is missing the " - f"'{self.AdditionalColumns.SCORES}' column. 
" - f"Columns are case-sensitive and must be comma delimited" - ) - - return self - - def _normalize_data(self) -> "MaveDataset": - # TODO Django dependent - """ - - Returns - _______ - `MaveDataset` - - Raises - ______ - ValueError - - """ - super()._normalize_data() - - should_be_numeric = [self.AdditionalColumns.SCORES] - for c in should_be_numeric: - if c in self.columns: - try: - self._df[c] = self._df[c].astype(dtype=float, errors="raise") - except ValueError as e: - self._errors.append(f"{c}: {str(e)}") - - return self - - -class MaveCountsDataset(MaveDataset): - # TODO - """ """ - @property - def label(self) -> str: - # TODO Django dependent - """ - - Returns - _______ - str - """ - return "counts" From 98ef8012d8557e1a0b049fd711a59f5590b3845a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:17:18 -0700 Subject: [PATCH 358/877] delete Django genome validators --- mavecore/validation/genome_validators.py | 588 ----------------------- 1 file changed, 588 deletions(-) delete mode 100644 mavecore/validation/genome_validators.py diff --git a/mavecore/validation/genome_validators.py b/mavecore/validation/genome_validators.py deleted file mode 100644 index ffa259e..0000000 --- a/mavecore/validation/genome_validators.py +++ /dev/null @@ -1,588 +0,0 @@ -# TODO Django dependent, Django forms, whole file needs to be refactored -""" -Validator functions for the fields of the following classes: - WildTypeSequence - ReferenceGenome - TargetGene - ReferenceMap - GenomicInterval - -Most validation should validate one specific field, unless fields need -to be validated against each other. 
-""" -from fqfa.validator.validator import dna_bases_validator, amino_acids_validator -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation import constants - -from mavecore.validation.utilities import is_null - - -# min_start_validator = MinValueValidator( -# 1, message=_("Start coordinate must be a positive integer.") -# ) -# min_end_validator = MinValueValidator( -# 1, message=_("End coordinate must be a positive integer.") -# ) - - -class WildTypeSequence:#g - """ - Basic model specifying a wild-type sequence. - - Parameters - ---------- - sequence : `models.CharField` - The wild type DNA sequence that is related to the `target`. Will - be converted to upper-case upon instantiation. - - sequence_type : `models.CharField` - Protein sequence (amino acids) or DNA (nucleotides) - """ - - class SequenceType:#g - """ """ - - DNA = "dna" - PROTEIN = "protein" - INFER = "infer" - - @classmethod - def detect_sequence_type(cls, sequence):#g - # TODO - # confirm sequence parameter type - """ - This function determines if the sequence is a DNA or protein sequence and - returns "dna" if it is DNA or "protein" if it is protein. An error is raised - if it is neither. - - Parameters - __________ - sequence : str - - Returns - _______ - str - "dna" or "protein" depending on if the sequence is a DNA or protein sequence. - - Raises - ______ - ValueError - If sequence parameter is not protein or DNA. - """ - if sequence_is_dna(sequence): - return cls.DNA - elif sequence_is_protein(sequence): - return cls.PROTEIN - else: - raise ValueError( - f"Unknown sequence '{sequence}'. It is not protein or DNA." 
- ) - - @classmethod - def is_protein(cls, value):#g - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.PROTEIN - - @classmethod - def is_dna(cls, value): #g - """ - - Parameters - __________ - value : - - Returns - _______ - - """ - return value == cls.DNA - - @classmethod - def choices(cls):#g - """ - - Returns - _______ - """ - return [(cls.INFER, "Infer"), (cls.DNA, "DNA"), (cls.PROTEIN, "Protein")] - - class Meta:#g - """ """ - - verbose_name = "Reference sequence" - verbose_name_plural = "Reference sequences" - - def __str__(self):#g - """ - - Returns - _______ - - """ - return self.get_sequence() - - # sequence = models.TextField( - # default=None, - # blank=False, - # null=False, - # verbose_name="Reference sequence", - # validation=[validate_wildtype_sequence], - # ) - # sequence_type = models.CharField( - # blank=True, - # null=False, - # default=SequenceType.INFER, - # verbose_name="Reference sequence type", - # max_length=32, - # choices=SequenceType.choices(), - # ) - - @property - def is_dna(self): - """ - - Returns - _______ - - """ - return #self.__class__.SequenceType.is_dna(self.sequence_type) - - @property - def is_protein(self): - """ - - Returns - _______ - - """ - return #self.__class__.SequenceType.is_protein(self.sequence_type) - - def save(self, *args, **kwargs): - """ - - Parameters - __________ - args : - kwargs : - - Returns - _______ - - """ - if self.sequence is not None: - self.sequence = self.sequence.upper() - self.sequence_type = ( - (self.__class__.SequenceType.detect_sequence_type(self.sequence)) - if self.__class__.SequenceType.INFER - else self.sequence_type - ) - - return super().save(*args, **kwargs) - - def get_sequence(self): - """ - - Returns - _______ - - """ - return self.sequence.upper() - - def is_attached(self): - """ - - Returns - _______ - - """ - return getattr(self, "target", None) is not None - - -# GenomicInterval -# 
------------------------------------------------------------------------- # -def validate_interval_start_lteq_end(start, end): - """ - This function validates whether or not an interval's starting coordinate is less than - or equal to that interval's ending coordinate. - - Parameters - __________ - start : int - The interval's starting coordinate. - end : int - The interval's ending coordinate. - - Returns - _______ - None - If start is NoneType or end is NoneType. - - Raises - ______ - ValidationError - If an interval's starting coordinate is greater than the ending coordinate. - """ - # Intervals may be underspecified, but will be ignored so skip validation. - if start is None or end is None: - return - if start > end: - raise ValidationError( - ( - "An interval's starting coordinate cannot be greater than the " - "ending coordinate." - ) - ) - - -def validate_strand(value): - # TODO - # find the type of value - """ - This function validates a GenomicInterval strand and raises an error if the strand is invalid. - - Parameters - __________ - value : - The Genomic Interval strand to be validated. - - Raises - ______ - ValidationError - If GenomicInterval strand is not positive or negative. - """ - if value not in ("+", "-"): - raise ValidationError("GenomicInterval strand must be either '+' or '-'") - - -def validate_chromosome(value): - # TODO - # add description and type for value parameter - """ - - Parameters - __________ - value : - - Returns - _______ - None - If value is NoneType. - - Raises - ______ - ValidationError - If chromosome identifier is null. - """ - # Intervals may be underspecified, but will be ignored so skip validation. 
- if value is None: - return - if is_null(value): - raise ValidationError("Chromosome identifier must not be null.") - - -def validate_unique_intervals(intervals): - # TODO - # add description and interval parameter type plus description - """ - - Parameters - __________ - intervals : - - Raises - ______ - ValidationError - If the same interval was specified twice. - """ - for interval1 in intervals: - for interval2 in intervals: - if ( - (interval1.pk is not None) - and (interval2.pk is not None) - and (interval1.pk == interval2.pk) - ): - continue - elif interval1 is interval2: - continue - elif interval1.equals(interval2): - raise ValidationError("You can not specify the same interval twice.") - - -# WildTypeSequence -# ------------------------------------------------------------------------- # -def validate_wildtype_sequence(seq, as_type="any"): - # TODO - # add description to as_type parameter - """ - This function checks whether or not seq is a wildtype sequence. - - Parameters - __________ - seq : str - The sequence being validated. - as_type : str - (default = "any") - - Raises - ______ - ValidationError - If seq is not a valid wild type sequence. - ValidationError - If seq is not a valid DNA or protein reference sequence. - """ - # from .models import WildTypeSequence - - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - raise ValidationError( - "'%(seq)s' is not a valid wild type sequence." # , params={"seq": seq} - ) - - seq = seq.upper() - is_dna = dna_bases_validator(seq) is not None - is_aa = amino_acids_validator(seq) is not None - - if as_type == WildTypeSequence.SequenceType.DNA and not is_dna: - raise ValidationError( - "'%(seq)s' is not a valid DNA reference sequence." # , - # params={"seq": seq}, - ) - elif as_type == WildTypeSequence.SequenceType.PROTEIN and not is_aa: - raise ValidationError( - "'%(seq)s' is not a valid protein reference sequence." 
# , - # params={"seq": seq}, - ) - elif (as_type == "any" or WildTypeSequence.SequenceType.INFER) and not ( - is_dna or is_aa - ): - raise ValidationError( - "'%(seq)s' is not a valid DNA or protein reference sequence." # , - # params={"seq": seq}, - ) - - -def sequence_is_dna(seq):#g - """ - This function checks if seq is a DNA sequence. - - Parameters - __________ - seq : str - The sequence to be validated. - - Returns - _______ - bool - True if the dna_bases_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - return dna_bases_validator(seq) is not None - - -def sequence_is_protein(seq):#g - """ - This function check if seq is a protein sequence. - - Parameters - __________ - seq : str - The sequence being validated. - - Returns - _______ - bool - True if seq is not null, is a DNA sequence or amino_acids_validator returns a match object. - """ - # Explicitly check for these cases as they are also valid AA sequences. - if is_null(seq): - return False - seq = seq.upper() - if dna_bases_validator(seq) is not None: - return False # Very likely a DNA sequence if only ATG - return amino_acids_validator(seq) is not None - - -# ReferenceGenome -# ------------------------------------------------------------------------- # -def validate_organism_name(organism_name): - """ - This function validates the organism name by checking that the name is not null. - - Parameters - __________ - organism_name : str - The organism name to be validated. - - Raises - ______ - ValidationError - If the organism name is null. - """ - if is_null(organism_name): - raise ValidationError("Species name must not be null.") - - -def validate_reference_genome_has_one_external_identifier(referencegenome): - # TODO - # revise description, make sure it is accurate - # anything greater than 0 will return True, so should it be == 1 or > 0? 
- # determine what type referencegenome is - """ - This function validates whether or not the reference genome has one external identifier. - An error is raised if - - Parameters - __________ - referencegenome : - - Raises - ______ - ValidationError - If - """ - if not referencegenome.genome_id: - raise ValidationError( - "Only one external identifier can be specified for a reference" "genome." - ) - - -def validate_genome_short_name(value): - # TODO - # confirm the type of the value parameter - """ - This function validates the genome short name and raises an error if the value is null. - - Parameters - __________ - value : str - The genome short name to be validated. - - Raises - ______ - ValidationError - If the genome short name is null. - """ - if is_null(value): - raise ValidationError("Genome short name must not be null.") - - -# ReferenceMap -# ------------------------------------------------------------------------- # -def validate_map_has_unique_reference_genome(annotations): - # TODO - # check the type of annotations - # add description to annotations parameter - """ - This function validates whether or not each map in annotations has a - unique reference genome and raises an error if this is not the case. - - Parameters - __________ - annotations : - - Raises - ______ - ValidationError - If each reference map does not specify a different reference genome. - """ - genomes = set([str(a.get_reference_genome_name()).lower() for a in annotations]) - if len(genomes) < len(annotations): - raise ValidationError( - "Each reference map must specify a different reference genome." - ) - - -def validate_map_has_at_least_one_interval(reference_map): - """ - This function validates that a reference map has at least one interval and raises an error - if this is not the case. - - Parameters - __________ - reference_map : - Reference map. - - Raises - ______ - ValidationError - If the reference_map does not have at least one interval. 
- """ - if not reference_map.get_intervals().count(): - raise ValidationError( - "You must specify at least one interval for each reference map." - ) - - -def validate_at_least_one_map(reference_maps): - """ - This function validates whether a target has at least one reference map specified - and raises an error if it does not. - - Parameters - __________ - reference_maps : - - - Raises - ______ - ValidationError - If the target does not have at least one reference map specified. - """ - if not len(reference_maps): - raise ValidationError( - "A target must have at least one reference map specified." - ) - - -def validate_one_primary_map(reference_maps): - """ - This function validates the existence of one primary reference map and raises an error - if it does not exist. - - Parameters - __________ - reference_maps : - - Raises - ______ - ValidationError - If target has less than or more than one primary reference map. - """ - primary_count = sum(a.is_primary_reference_map() for a in reference_maps) - if primary_count > 1 or primary_count < 1: - raise ValidationError("A target must have one primary reference map.") - - -# TargetGene -# ------------------------------------------------------------------------- # -def validate_gene_name(gene_name): - # TODO - # confirm gene_name type - """ - This function checks to see if a gene name is null and raises and error if it is. - - Parameters - __________ - gene_name : str - The gene name. - - Raises - ______ - ValidationError - If gene name (value parameter) is null. 
- """ - if is_null(gene_name): - raise ValidationError("Gene name must not be null.") From 7981eb3577b0d2fe338e23a0d87fd83bfef4a4a0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:17:28 -0700 Subject: [PATCH 359/877] delete test cases for Django genome validators --- .../test_validation/test_genome_validators.py | 137 ------------------ 1 file changed, 137 deletions(-) delete mode 100644 tests/test_validation/test_genome_validators.py diff --git a/tests/test_validation/test_genome_validators.py b/tests/test_validation/test_genome_validators.py deleted file mode 100644 index 6f36283..0000000 --- a/tests/test_validation/test_genome_validators.py +++ /dev/null @@ -1,137 +0,0 @@ -from unittest import TestCase - -from mavecore.validation.genome_validators import WildTypeSequence - -# from mavetools.validation.genome_factories import ( -# ReferenceMapFactory, -# ReferenceGenomeFactory, -# GenomicIntervalFactory, -# ) - - -from mavecore.validation.genome_validators import ( - validate_wildtype_sequence, - validate_gene_name, - validate_genome_short_name, - validate_organism_name, - sequence_is_protein, - sequence_is_dna, -) -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation.constants import null_values_list - - -class TestWildTypeSequenceValidators(TestCase): - """ - Tests validation associated with :class:`WildTypeSequence`. 
Tests: - - - validate_wildtype_sequence - """ - - def test_ve_not_a_sequence_of_nucleotides_or_aa(self): - with self.assertRaises(ValidationError): - validate_wildtype_sequence("2823d") - - def test_ve_null(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_wildtype_sequence(v) - - def test_passes_lowercase_nucleotides(self): - validate_wildtype_sequence("atcg") - - def test_passes_uppercase_nucleotides(self): - validate_wildtype_sequence("ATCG") - - def test_passes_lowercase_aa(self): - validate_wildtype_sequence("MDLSALRVEE") - - def test_passes_uppercase_aa(self): - validate_wildtype_sequence("MDLSALRVEE".lower()) - - def test_pass_validate_dna_sequence(self): - validate_wildtype_sequence("ATCG", as_type=WildTypeSequence.SequenceType.DNA) - - def test_pass_validate_protein_sequence(self): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - - def test_fails_validate_as_type_dna_but_seq_is_protein(self): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - with self.assertRaises(ValidationError): - validate_wildtype_sequence( - "MDLS", as_type=WildTypeSequence.SequenceType.DNA - ) - - def test_fail_validate_as_type_protein_when_sequence_is_invalid(self): - with self.assertRaises(ValidationError): - validate_wildtype_sequence( - "ABC", as_type=WildTypeSequence.SequenceType.PROTEIN - ) - - -class TestIsProteinSequence(TestCase): - def test_false_null(self): - for v in null_values_list: - self.assertFalse(sequence_is_protein(v)) - - def test_false_dna_sequence(self): - # Favor dna sequences when only ATCG - self.assertFalse(sequence_is_protein("ATCG")) - self.assertFalse(sequence_is_protein("atc")) - - def test_true_aa_sequence(self): - self.assertTrue(sequence_is_protein("MDLSALRVEEATC")) - self.assertTrue(sequence_is_protein("MDLSALRVEEATC".lower())) - - -class TestIsDNASequence(TestCase): - def test_false_null(self): - for v in null_values_list: - 
self.assertFalse(sequence_is_protein(v)) - - def test_true_dna_sequence(self): - self.assertTrue(sequence_is_dna("ATCG")) - self.assertTrue(sequence_is_dna("atc")) - - def test_false_aa_sequence(self): - self.assertFalse(sequence_is_dna("MDLSALRVEEATC")) - self.assertFalse(sequence_is_dna("MDLSALRVEEATC".lower())) - - -class TestReferenceGenomeValidators(TestCase): - """ - Tests validation associated with :class:`ReferenceGenome`: - - - validate_reference_genome_has_one_external_identifier - - validate_organism_name - - validate_genome_short_name - """ - - def test_ve_null_organism_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_organism_name(v) - - def test_ve_null_genome_short_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_genome_short_name(v) - - -class TestTargetGeneValidators(TestCase): - """ - Tests validation asscociated with :class:`TargetGene`: - - - validate_gene_name - - validate_target_has_one_primary_reference_map - """ - - def test_ve_null_gene_name(self): - for v in null_values_list: - with self.assertRaises(ValidationError): - validate_gene_name(v) From 04e0087800e2421f1bf1dcb05ccfb2e43c3421ff Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 22 Jun 2022 21:17:42 -0700 Subject: [PATCH 360/877] delete full payload validators --- mavecore/validation/validate.py | 69 --------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 mavecore/validation/validate.py diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py deleted file mode 100644 index b138c9a..0000000 --- a/mavecore/validation/validate.py +++ /dev/null @@ -1,69 +0,0 @@ -from mavecore.validation import dataset_validators - - -def validate_all(countfile=None, scorefile=None, scorejson=None): - """ - By calling other helper functions, this function runs all of the validation code. 
- - Parameters - __________ - countfile : - scorefile : - scorejson : - - """ - validate_dataset(countfile, scorefile, scorejson) - - -def validate_dataset(countfile=None, scorefile=None, scorejson=None): - """ - This function calls all of the validation functions within - mavetools/mavetools/validation/dataset_validation.py - - Parameters - __________ - countfile : - scorefile : - scorejson : - - Returns - ------- - - """ - - # how to incorporate word limit validator? - - if scorefile is not None: - # open scorefile - open(scorefile) - # this one returns header - scoreheader = dataset_validators.read_header_from_io(file=scorefile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=scoreheader) - dataset_validators.validate_at_least_one_additional_column(header=scoreheader) - dataset_validators.validate_header_contains_no_null_columns(header=scoreheader) - - dataset_validators.validate_scoreset_score_data_input(file=scorefile) - - if scorejson is not None: - # open scorejson - open(scorejson) - dataset_validators.validate_scoreset_json(dict_=scorejson) - - if countfile is not None: - # open countfile - open(countfile) - countheader = dataset_validators.read_header_from_io(file=countfile) - - # if the header was returned, do these ones - dataset_validators.validate_has_hgvs_in_header(header=countheader) - dataset_validators.validate_at_least_one_additional_column(header=countheader) - dataset_validators.validate_header_contains_no_null_columns(header=countheader) - - dataset_validators.validate_scoreset_count_data_input(file=countfile) - - if scorefile is not None and countfile is not None: - dataset_validators.validate_datasets_define_same_variants( - scores=scorefile, counts=countfile - ) From 39488458f602f9d8b248d52e3d89c2d6b82f4f31 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:00:21 -0700 Subject: [PATCH 361/877] initial commit, rewrite_validation 
--- mavecore/validation_new/__init__.py | 0 mavecore/validation_new/experiment.py | 46 ++++++++++ mavecore/validation_new/general.py | 6 ++ mavecore/validation_new/identifiers.py | 14 +++ mavecore/validation_new/metadata.py | 0 mavecore/validation_new/scoreset.py | 117 +++++++++++++++++++++++++ mavecore/validation_new/summary.py | 19 ++++ mavecore/validation_new/urn.py | 0 mavecore/validation_new/user.py | 4 + 9 files changed, 206 insertions(+) create mode 100644 mavecore/validation_new/__init__.py create mode 100644 mavecore/validation_new/experiment.py create mode 100644 mavecore/validation_new/general.py create mode 100644 mavecore/validation_new/identifiers.py create mode 100644 mavecore/validation_new/metadata.py create mode 100644 mavecore/validation_new/scoreset.py create mode 100644 mavecore/validation_new/summary.py create mode 100644 mavecore/validation_new/urn.py create mode 100644 mavecore/validation_new/user.py diff --git a/mavecore/validation_new/__init__.py b/mavecore/validation_new/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py new file mode 100644 index 0000000..11bc056 --- /dev/null +++ b/mavecore/validation_new/experiment.py @@ -0,0 +1,46 @@ +[ + { + "title": "string", + "shortDescription": "string", + "abstractText": "string", + "methodText": "string", + "extraMetadata": {}, + "keywords": [ + "string" + ], + "urn": "string", + "numScoresets": 0, + "createdBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "modifiedBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02", + "publishedDate": "2022-08-02", + "experimentSetUrn": "string", + "doiIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string" + } + ], + "pubmedIdentifiers": [ + { + "identifier": "string", + 
"id": 0, + "url": "string", + "referenceHtml": "string" + } + ], + "processingState": "string" + } +] \ No newline at end of file diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py new file mode 100644 index 0000000..da0f32e --- /dev/null +++ b/mavecore/validation_new/general.py @@ -0,0 +1,6 @@ +def validate_processing_state(processingState): + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02", + "publishedDate": "2022-08-02", + +def validate_date(date): \ No newline at end of file diff --git a/mavecore/validation_new/identifiers.py b/mavecore/validation_new/identifiers.py new file mode 100644 index 0000000..f0dba18 --- /dev/null +++ b/mavecore/validation_new/identifiers.py @@ -0,0 +1,14 @@ +"doiIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string" + } + ], + "pubmedIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string", + "referenceHtml": "string" + } \ No newline at end of file diff --git a/mavecore/validation_new/metadata.py b/mavecore/validation_new/metadata.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py new file mode 100644 index 0000000..ccfa730 --- /dev/null +++ b/mavecore/validation_new/scoreset.py @@ -0,0 +1,117 @@ +{ + "urn": "string", + "title": "string", + "methodText": "string", + "abstractText": "string", + "shortDescription": "string", + "extraMetadata": {}, + "dataUsagePolicy": "string", + "licenceId": 0, + "replacesId": 0, + "keywords": [ + "string" + ], + "numVariants": 0, + "experiment": { + "title": "string", + "shortDescription": "string", + "abstractText": "string", + "methodText": "string", + "extraMetadata": {}, + "keywords": [ + "string" + ], + "urn": "string", + "numScoresets": 0, + "createdBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "modifiedBy": { + "orcid_id": "string", + "firstName": "string", + 
"lastName": "string", + "email": "string" + }, + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02", + "publishedDate": "2022-08-02", + "experimentSetUrn": "string", + "doiIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string" + } + ], + "pubmedIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string", + "referenceHtml": "string" + } + ], + "processingState": "string" + }, + "doiIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string" + } + ], + "pubmedIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string", + "referenceHtml": "string" + } + ], + "publishedDate": "2022-08-02", + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02", + "createdBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "modifiedBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "targetGene": { + "name": "string", + "category": "string", + "referenceMaps": [ + { + "id": 0, + "genomeId": 0, + "targetId": 0, + "isPrimary": true, + "genome": { + "shortName": "string", + "organismName": "string", + "genomeId": 0, + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02", + "id": 0 + }, + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02" + } + ], + "wtSequence": { + "sequenceType": "string", + "sequence": "string" + } + }, + "datasetColumns": {}, + "private": true +} \ No newline at end of file diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py new file mode 100644 index 0000000..0b10540 --- /dev/null +++ b/mavecore/validation_new/summary.py @@ -0,0 +1,19 @@ +def validate_title(title): + +def validate_short_description(shortDescription): + +def validate_abstract(abstractText): + """ + + :param absract: + :return: + """ + return None + +def validate_methods(methodText): + +def validate_keywords(keywords): +"methodText": "string", + 
"extraMetadata": {}, + "keywords": [ +def validate_num_scoresets(numScoresets): diff --git a/mavecore/validation_new/urn.py b/mavecore/validation_new/urn.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/validation_new/user.py b/mavecore/validation_new/user.py new file mode 100644 index 0000000..27c25dd --- /dev/null +++ b/mavecore/validation_new/user.py @@ -0,0 +1,4 @@ +def validate_orcid_id(orcid_id): +def validate_first_name(firstName): +def validate_last_name(lastName): +def validate_email(email): \ No newline at end of file From eda3a4145f93b1eae570a4892ea17af43c0643d7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:06:06 -0700 Subject: [PATCH 362/877] docstring from scoreset validation --- mavecore/validation_new/scoreset.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index ccfa730..5ab1ced 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -1,3 +1,24 @@ +def validate_scoreset(scoreset, files): + """ + Validates a scoreset represented as a dictionary. + + Parameters: + __________ + scoreset: Dict + The scoreset that will be validated. + files: path + The path to the files belonging to the scoreset. + + Raises: + ______ + ValidationError + If any validation fails. 
+ """ + try: + validate_urn(scoreset.get("urn")) + validate_title(title) + + { "urn": "string", "title": "string", From cc2f62628f5d4a7465027cf578470053811ad9e4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:14:56 -0700 Subject: [PATCH 363/877] add urn constants --- mavecore/validation_new/constants/__init__.py | 0 mavecore/validation_new/constants/urn.py | 54 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 mavecore/validation_new/constants/__init__.py create mode 100644 mavecore/validation_new/constants/urn.py diff --git a/mavecore/validation_new/constants/__init__.py b/mavecore/validation_new/constants/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/validation_new/constants/urn.py b/mavecore/validation_new/constants/urn.py new file mode 100644 index 0000000..54d9b22 --- /dev/null +++ b/mavecore/validation_new/constants/urn.py @@ -0,0 +1,54 @@ +import re + +MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 +MAVEDB_TMP_URN_DIGITS = 16 +MAVEDB_URN_MAX_LENGTH = 64 +MAVEDB_URN_NAMESPACE = "mavedb" + + +# Temp URN patterns +# --------------------------------------------------------------------------- # +MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( + width=MAVEDB_TMP_URN_DIGITS +) +MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) + + +# Experimentset Pattern/Compiled RE +MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( + namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS +) +MAVEDB_EXPERIMENTSET_URN_RE = re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) + +# Experiment Pattern/Compiled RE +MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( + pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] +) +MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) + +# Scoreset Pattern/Compiled RE +MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( + 
pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] +) +MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) + +# Variant Pattern/Compiled RE +MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( + pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] +) +MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) + +# Any Pattern/Compiled RE +MAVEDB_ANY_URN_PATTERN = "|".join( + [ + r"({pattern})".format(pattern=p) + for p in ( + MAVEDB_EXPERIMENTSET_URN_PATTERN, + MAVEDB_EXPERIMENT_URN_PATTERN, + MAVEDB_SCORESET_URN_PATTERN, + MAVEDB_VARIANT_URN_PATTERN, + MAVEDB_TMP_URN_PATTERN, + ) + ] +) +MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) \ No newline at end of file From 1ec047fce6d9bfb3442040c42a990c316f24a0c7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:15:07 -0700 Subject: [PATCH 364/877] add urn validation --- mavecore/validation_new/urn.py | 97 ++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/mavecore/validation_new/urn.py b/mavecore/validation_new/urn.py index e69de29..cf3162e 100644 --- a/mavecore/validation_new/urn.py +++ b/mavecore/validation_new/urn.py @@ -0,0 +1,97 @@ +from constants.urn import * +from mavecore.validation.exceptions import ValidationError + + +def validate_urn(urn): + """ + This function validates a MaveDB urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The MaveDB urn to be validated. + + Raises + ______ + ValidationError + If the MaveDB urn is not valid. + """ + if not MAVEDB_ANY_URN_RE.match(urn): + raise ValidationError("{}'s is not a valid urn.".format(urn)) + + +def validate_experimentset_urn(urn): + """ + This function validates a Experiment Set urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The Experiment Set urn to be validated. + + Raises + ______ + ValidationError + If the Experiment Set urn is not valid. 
+ """ + if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + # "Error test" + "{}'s is not a valid Experiment Set urn.".format(urn) + ) + + +def validate_experiment_urn(urn): + """ + This function validates an Experiment urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The Experiment urn to be validated. + + Raises + ______ + ValidationError + If the Experiemnt urn is not valid. + """ + if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "{}'s is not a valid Experiment urn.".format(urn) + ) + + +def validate_scoreset_urn(urn): + """ + This function validates a Scoreset urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The Scoreset urn to be validated + + Raises + ______ + ValidationError + If the Scoreset urn is not valid. + """ + if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError("{}'s is not a valid score set urn.".format(urn)) + + +def validate_variant_urn(urn): + """ + This function validates a MaveDB Variant urn and raises an error if it is not valid. + + Parameters + __________ + urn : str + The MaveDB Variant urn to be validated. + + Raises + ______ + ValidationError + If the MaveDB Variant urn is not valid. 
+ """ + if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError("{}'s is not a valid Variant urn.".format(urn)) From 0f86cdc2337cb007a5adea4ce3c3f22f6a809568 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:23:48 -0700 Subject: [PATCH 365/877] write docstring and outline validate title --- mavecore/validation_new/summary.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 0b10540..e9ce0d2 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -1,4 +1,19 @@ def validate_title(title): + """ + Validates a title to an experiment set, an experiment, or a scoreset. + + Parameters: + __________ + title: str + The title to be validated. + + Raises: + ______ + ValidationError if the title is not valid. + """ + # check if title is a string + + # check that title is not too long def validate_short_description(shortDescription): From 516368d8264247ee01718d1d63862d8fe9fc9b32 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:26:53 -0700 Subject: [PATCH 366/877] add title validation code --- mavecore/validation_new/summary.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index e9ce0d2..2cc397e 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -1,3 +1,4 @@ +from exceptions import ValidationError def validate_title(title): """ Validates a title to an experiment set, an experiment, or a scoreset. @@ -12,6 +13,7 @@ def validate_title(title): ValidationError if the title is not valid. 
""" # check if title is a string + if type(title) != str: raise ValidationError("{}'s is not a valid title.".format(title)) # check that title is not too long From b84345ef024e6fcf61154de7ebcb6ba6d95f9458 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:04:15 -0700 Subject: [PATCH 367/877] add exceptions.py --- mavecore/validation_new/exceptions.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 mavecore/validation_new/exceptions.py diff --git a/mavecore/validation_new/exceptions.py b/mavecore/validation_new/exceptions.py new file mode 100644 index 0000000..b3e419b --- /dev/null +++ b/mavecore/validation_new/exceptions.py @@ -0,0 +1,5 @@ +NON_FIELD_ERRORS = "__all__" + + +class ValidationError(ValueError): + None \ No newline at end of file From 3411a1f1cf8b25319498d34d3bbe5f622ff8e033 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:04:30 -0700 Subject: [PATCH 368/877] add general constants --- mavecore/validation_new/constants/general.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 mavecore/validation_new/constants/general.py diff --git a/mavecore/validation_new/constants/general.py b/mavecore/validation_new/constants/general.py new file mode 100644 index 0000000..82ff0d7 --- /dev/null +++ b/mavecore/validation_new/constants/general.py @@ -0,0 +1,2 @@ +# valid data usage policies + From 59c7731a7b3821e62e0f1eb7bc32eaf38559e7c9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:04:57 -0700 Subject: [PATCH 369/877] declare general validation and start docstrings --- mavecore/validation_new/general.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py index da0f32e..2d1fd44 100644 --- a/mavecore/validation_new/general.py +++ b/mavecore/validation_new/general.py @@ -1,3 +1,21 @@ 
+def validate_data_usage_policy(dataUsagePolicy): + """ + Validates + :param dataUsagePolicy: + :return: + """ +def validate_license_id(licenseId): + """ + + :param licenseId: + :return: + """ +def validate_replaces_id(replacesId): + """ + + :param replacesId: + :return: + """ def validate_processing_state(processingState): "creationDate": "2022-08-02", "modificationDate": "2022-08-02", From 6741f220ebfb7d9e5149a585aca62c0a9844217f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:05:18 -0700 Subject: [PATCH 370/877] add note to validate attribute --- mavecore/validation_new/general.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py index 2d1fd44..fa30955 100644 --- a/mavecore/validation_new/general.py +++ b/mavecore/validation_new/general.py @@ -21,4 +21,6 @@ def validate_processing_state(processingState): "modificationDate": "2022-08-02", "publishedDate": "2022-08-02", -def validate_date(date): \ No newline at end of file +def validate_date(date): + +"private": true \ No newline at end of file From 20c9d6bdf0d3570e1c9013d345846072c3cd4b48 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:06:02 -0700 Subject: [PATCH 371/877] declare metadata validation and start docstring, mark function as TODO --- mavecore/validation_new/metadata.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation_new/metadata.py b/mavecore/validation_new/metadata.py index e69de29..981bf62 100644 --- a/mavecore/validation_new/metadata.py +++ b/mavecore/validation_new/metadata.py @@ -0,0 +1,15 @@ +def validate_metadata(extraMetadata): + """ + This function validates metadata associated with an upload. + + Parameters: + __________ + extraMetadata: Dict + The metadata to be validated. 
+ + Raises: + ______ + ValidationError + If any of the key:value pairs are not valid. + """ + #TODO \ No newline at end of file From 62233993e44e182d95fbe7b95b60b2ae32d7518f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:06:12 -0700 Subject: [PATCH 372/877] edit imports --- mavecore/validation_new/scoreset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 5ab1ced..ac77635 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -1,3 +1,6 @@ +import urn, summary, metadata + + def validate_scoreset(scoreset, files): """ Validates a scoreset represented as a dictionary. From a4cb0ede97284725678b2e3b66be2f936619d0e6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:06:48 -0700 Subject: [PATCH 373/877] validate scoreset urn, summary info and metadata --- mavecore/validation_new/scoreset.py | 31 +++++++++++++++-------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index ac77635..6cd171c 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -17,21 +17,22 @@ def validate_scoreset(scoreset, files): ValidationError If any validation fails. 
""" - try: - validate_urn(scoreset.get("urn")) - validate_title(title) - - -{ - "urn": "string", - "title": "string", - "methodText": "string", - "abstractText": "string", - "shortDescription": "string", - "extraMetadata": {}, - "dataUsagePolicy": "string", - "licenceId": 0, - "replacesId": 0, + # { + # "urn": "string", + # "title": "string", + # "methodText": "string", + # "abstractText": "string", + # "shortDescription": "string", + urn.validate_scoreset_urn(scoreset.get("urn")) + summary.validate_title(scoreset.get("title")) + summary.validate_methods(scoreset.get("methodText")) + summary.validate_abstract(scoreset.get("abstractText")) + summary.validate_short_description(scoreset.get("shortDescription")) + #"extraMetadata": {}, + metadata.validate_metadata(scoreset.get("extraMetadata")) + #"dataUsagePolicy": "string", + #"licenceId": 0, + #"replacesId": 0, "keywords": [ "string" ], From 390d9ba021c215cb61e193d15c79e1e8ad57e807 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:07:09 -0700 Subject: [PATCH 374/877] create file to declare summary constants --- mavecore/validation_new/constants/summary.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 mavecore/validation_new/constants/summary.py diff --git a/mavecore/validation_new/constants/summary.py b/mavecore/validation_new/constants/summary.py new file mode 100644 index 0000000..4abaab2 --- /dev/null +++ b/mavecore/validation_new/constants/summary.py @@ -0,0 +1 @@ +# valid keywords \ No newline at end of file From 536a56bfebd8b95420b799dab0baa13231faeff1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:07:24 -0700 Subject: [PATCH 375/877] edit title validation docstring --- mavecore/validation_new/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 2cc397e..8e8e189 100644 --- 
a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -1,7 +1,7 @@ from exceptions import ValidationError def validate_title(title): """ - Validates a title to an experiment set, an experiment, or a scoreset. + Validates a title of an experiment set, an experiment, or a scoreset. Parameters: __________ From 6efca80dfdede523a342d3d3ad4c881687baf70d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:07:43 -0700 Subject: [PATCH 376/877] edit validation error message --- mavecore/validation_new/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 8e8e189..71cc90a 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -13,7 +13,7 @@ def validate_title(title): ValidationError if the title is not valid. """ # check if title is a string - if type(title) != str: raise ValidationError("{}'s is not a valid title.".format(title)) + if type(title) != str: raise ValidationError("The title must be a string.") # check that title is not too long From 117694a2433e478ecfd176c39da984137fa02d35 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:08:04 -0700 Subject: [PATCH 377/877] write docstring and validation code to validate short description --- mavecore/validation_new/summary.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 71cc90a..cfb429d 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -18,6 +18,24 @@ def validate_title(title): # check that title is not too long def validate_short_description(shortDescription): + """ + Validates the short description of an experiment set, an experiment, or a scoreset. 
+ + Parameters: + __________ + shortDescription: str + The short description to be validated. + + Raises: + ______ + ValidationError if the short description is too long or is not a string. + """ + # check if short description is a string + if type(shortDescription) != str: raise ValidationError("The short description must be a string.") + + # check if short description is too long + count = len(shortDescription.split(" ")) + if count > 50: raise ValidationError("The short description must be less than or equal to 50 words.") def validate_abstract(abstractText): """ From 530a274dc2957e7346f91da9c2e965bc484e3a0e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:08:28 -0700 Subject: [PATCH 378/877] write validate abstract docstring and function --- mavecore/validation_new/summary.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index cfb429d..c1593ac 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -39,11 +39,23 @@ def validate_short_description(shortDescription): def validate_abstract(abstractText): """ + Validates the abstract of an experiment set, an experiment, or a scoreset. - :param absract: - :return: + Parameters: + __________ + abstractText: str + The abstract to be validated. + + Raises: + ______ + ValidationError if the abstract is too long or is not a string. 
""" - return None + # check if short description is a string + if type(abstractText) != str: raise ValidationError("The abstract must be a string.") + + # check if short description is too long + count = len(abstractText.split(" ")) + if count > 200: raise ValidationError("The abstract must be less than or equal to 200 words.") def validate_methods(methodText): From 9580f57920c784d1b91ac1f6cafa6416a02ffac7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:08:39 -0700 Subject: [PATCH 379/877] write validate methods docstring and function --- mavecore/validation_new/summary.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index c1593ac..b6305e9 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -58,6 +58,24 @@ def validate_abstract(abstractText): if count > 200: raise ValidationError("The abstract must be less than or equal to 200 words.") def validate_methods(methodText): + """ + Validates the methods of an experiment set, an experiment, or a scoreset. + + Parameters: + __________ + methodText: str + The methods to be validated. + + Raises: + ______ + ValidationError if the abstract is too long or is not a string. 
+ """ + # check if short description is a string + if type(methodText) != str: raise ValidationError("The methods must be a string.") + + # check if short description is too long + count = len(methodText.split(" ")) + if count > 200: raise ValidationError("The methods must be less than or equal to 200 words.") def validate_keywords(keywords): "methodText": "string", From 529625813183b242a6af80c29317f391c178b94d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:08:53 -0700 Subject: [PATCH 380/877] write validate keywords docstring --- mavecore/validation_new/summary.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index b6305e9..15e14e4 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -78,7 +78,17 @@ def validate_methods(methodText): if count > 200: raise ValidationError("The methods must be less than or equal to 200 words.") def validate_keywords(keywords): -"methodText": "string", - "extraMetadata": {}, + """ + Validates the methods of an experiment set, an experiment, or a scoreset. + + Parameters: + __________ + methodText: str + The methods to be validated. + + Raises: + ______ + ValidationError if the abstract is too long or is not a string. 
+ """ "keywords": [ def validate_num_scoresets(numScoresets): From 45179aa7edb4a76b71e416e4e145e7a0834c3a65 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:09:30 -0700 Subject: [PATCH 381/877] note target gene attributes --- mavecore/validation_new/target.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 mavecore/validation_new/target.py diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py new file mode 100644 index 0000000..40d5a74 --- /dev/null +++ b/mavecore/validation_new/target.py @@ -0,0 +1,26 @@ +"targetGene": { + "name": "string", + "category": "string", + "referenceMaps": [ + { + "id": 0, + "genomeId": 0, + "targetId": 0, + "isPrimary": true, + "genome": { + "shortName": "string", + "organismName": "string", + "genomeId": 0, + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02", + "id": 0 + }, + "creationDate": "2022-08-02", + "modificationDate": "2022-08-02" + } + ], + "wtSequence": { + "sequenceType": "string", + "sequence": "string" + } + }, \ No newline at end of file From 402cafbc569d5c570c497a2915f188a702726fe6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:36:58 -0700 Subject: [PATCH 382/877] declare function to validate experiment and write docstring --- mavecore/validation_new/experiment.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 11bc056..de20877 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -1,3 +1,17 @@ +def validate_experiemnt(experiment): + """ + This function validates an experiment. + + Parameters: + __________ + experiment: dict + The experiment represented as a dictionary. 
+ + Raises: + ______ + ValidationError + If the experiment is not a dictionary or if any key:value pair in the experiment is not valid. + """ [ { "title": "string", From d4dc54046e87d23a31e35215dc934a5ba020accb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:37:23 -0700 Subject: [PATCH 383/877] note all attributes of an experiment --- mavecore/validation_new/experiment.py | 47 ++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index de20877..891a782 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -57,4 +57,49 @@ def validate_experiemnt(experiment): ], "processingState": "string" } -] \ No newline at end of file +] + + # "experiment": { + # "title": "string", + # "shortDescription": "string", + # "abstractText": "string", + # "methodText": "string", + # "extraMetadata": {}, + # "keywords": [ + # "string" + # ], + # "urn": "string", + # "numScoresets": 0, + # "createdBy": { + # "orcid_id": "string", + # "firstName": "string", + # "lastName": "string", + # "email": "string" + # }, + # "modifiedBy": { + # "orcid_id": "string", + # "firstName": "string", + # "lastName": "string", + # "email": "string" + # }, + # "creationDate": "2022-08-02", + # "modificationDate": "2022-08-02", + # "publishedDate": "2022-08-02", + # "experimentSetUrn": "string", + # "doiIdentifiers": [ + # { + # "identifier": "string", + # "id": 0, + # "url": "string" + # } + # ], + # "pubmedIdentifiers": [ + # { + # "identifier": "string", + # "id": 0, + # "url": "string", + # "referenceHtml": "string" + # } + # ], + # "processingState": "string" + # }, \ No newline at end of file From 093c56697e45644816a6bb826e8ff8e60e70454e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:37:34 -0700 Subject: [PATCH 384/877] edit 
imports --- mavecore/validation_new/scoreset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 6cd171c..82462ec 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -1,4 +1,5 @@ -import urn, summary, metadata +from exceptions import ValidationError +import urn, summary, metadata, general, experiment def validate_scoreset(scoreset, files): From 23b91437a1615935de6c48e607b10d87979a9544 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:37:51 -0700 Subject: [PATCH 385/877] edit validation error description --- mavecore/validation_new/scoreset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 82462ec..72a9d25 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -16,7 +16,7 @@ def validate_scoreset(scoreset, files): Raises: ______ ValidationError - If any validation fails. + If scoreset is not a dictionary or if any additional validation fails. """ # { # "urn": "string", From e8d47a90c9a9548c7b83362492e8a1cce9481456 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:38:07 -0700 Subject: [PATCH 386/877] check type of scoreset object --- mavecore/validation_new/scoreset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 72a9d25..b17fac8 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -18,6 +18,8 @@ def validate_scoreset(scoreset, files): ValidationError If scoreset is not a dictionary or if any additional validation fails. 
""" + # first validate that scoreset is a dictionary + if type(scoreset) != dict: raise ValidationError("The scoreset must be a dictionary.") # { # "urn": "string", # "title": "string", From 8dbe89ebe064331caa6d20b961994fc6599c075d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:38:30 -0700 Subject: [PATCH 387/877] valiate scoreset metadata --- mavecore/validation_new/scoreset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index b17fac8..0d339bc 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -31,7 +31,7 @@ def validate_scoreset(scoreset, files): summary.validate_methods(scoreset.get("methodText")) summary.validate_abstract(scoreset.get("abstractText")) summary.validate_short_description(scoreset.get("shortDescription")) - #"extraMetadata": {}, + # "extraMetadata": {}, metadata.validate_metadata(scoreset.get("extraMetadata")) #"dataUsagePolicy": "string", #"licenceId": 0, From 43b1de5bddb4ea00fa5fe862a4bc0c57c0d451ac Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:39:04 -0700 Subject: [PATCH 388/877] validate general and summary information and comment scoreset attributes --- mavecore/validation_new/scoreset.py | 185 +++++++++++----------------- 1 file changed, 75 insertions(+), 110 deletions(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 0d339bc..fddd618 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -33,113 +33,78 @@ def validate_scoreset(scoreset, files): summary.validate_short_description(scoreset.get("shortDescription")) # "extraMetadata": {}, metadata.validate_metadata(scoreset.get("extraMetadata")) - #"dataUsagePolicy": "string", - #"licenceId": 0, - #"replacesId": 0, - "keywords": [ - "string" - ], - 
"numVariants": 0, - "experiment": { - "title": "string", - "shortDescription": "string", - "abstractText": "string", - "methodText": "string", - "extraMetadata": {}, - "keywords": [ - "string" - ], - "urn": "string", - "numScoresets": 0, - "createdBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "modifiedBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02", - "publishedDate": "2022-08-02", - "experimentSetUrn": "string", - "doiIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string" - } - ], - "pubmedIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } - ], - "processingState": "string" - }, - "doiIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string" - } - ], - "pubmedIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } - ], - "publishedDate": "2022-08-02", - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02", - "createdBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "modifiedBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "targetGene": { - "name": "string", - "category": "string", - "referenceMaps": [ - { - "id": 0, - "genomeId": 0, - "targetId": 0, - "isPrimary": true, - "genome": { - "shortName": "string", - "organismName": "string", - "genomeId": 0, - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02", - "id": 0 - }, - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02" - } - ], - "wtSequence": { - "sequenceType": "string", - "sequence": "string" - } - }, - "datasetColumns": {}, - "private": true -} \ No newline at end of file + # "dataUsagePolicy": "string", + # "licenceId": 0, + # 
"replacesId": 0, + general.validate_data_usage_policy(scoreset.get("dataUsagePolicy")) + general.validate_license_id(scoreset.get("licenseId")) + general.validate_replaces_id(scoreset.get("replacesId")) + # "keywords": [ + # "string" + # ], + summary.validate_keywords(scoreset.get("keywords")) + # "numVariants": 0, + summary.validate_num_variants(scoreset.get("numVariants")) + # "experiment": { + # }, + experiment. + + # "doiIdentifiers": [ + # { + # "identifier": "string", + # "id": 0, + # "url": "string" + # } + # ], + # "pubmedIdentifiers": [ + # { + # "identifier": "string", + # "id": 0, + # "url": "string", + # "referenceHtml": "string" + # } + # ], + # "publishedDate": "2022-08-02", + # "creationDate": "2022-08-02", + # "modificationDate": "2022-08-02", + # "createdBy": { + # "orcid_id": "string", + # "firstName": "string", + # "lastName": "string", + # "email": "string" + # }, + # "modifiedBy": { + # "orcid_id": "string", + # "firstName": "string", + # "lastName": "string", + # "email": "string" + # }, + # "targetGene": { + # "name": "string", + # "category": "string", + # "referenceMaps": [ + # { + # "id": 0, + # "genomeId": 0, + # "targetId": 0, + # "isPrimary": true, + # "genome": { + # "shortName": "string", + # "organismName": "string", + # "genomeId": 0, + # "creationDate": "2022-08-02", + # "modificationDate": "2022-08-02", + # "id": 0 + # }, + # "creationDate": "2022-08-02", + # "modificationDate": "2022-08-02" + # } + # ], + # "wtSequence": { + # "sequenceType": "string", + # "sequence": "string" + # } + # }, + # "datasetColumns": {}, + # "private": true + # } From 1b1d3f1162ee69da6b375e00cf9cd81bc27ed7da Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:39:15 -0700 Subject: [PATCH 389/877] reformat --- mavecore/validation_new/summary.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 15e14e4..f9fb97d 100644 --- 
a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -1,4 +1,6 @@ from exceptions import ValidationError + + def validate_title(title): """ Validates a title of an experiment set, an experiment, or a scoreset. From 30eb50454e4dc2095fa7a6c24b3ca3c630d2248c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:39:41 -0700 Subject: [PATCH 390/877] reformat docstrings and spacing --- mavecore/validation_new/summary.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index f9fb97d..57b7624 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -12,13 +12,15 @@ def validate_title(title): Raises: ______ - ValidationError if the title is not valid. + ValidationError + If the title is not valid. """ # check if title is a string if type(title) != str: raise ValidationError("The title must be a string.") # check that title is not too long + def validate_short_description(shortDescription): """ Validates the short description of an experiment set, an experiment, or a scoreset. @@ -30,7 +32,8 @@ def validate_short_description(shortDescription): Raises: ______ - ValidationError if the short description is too long or is not a string. + ValidationError + If the short description is too long or is not a string. """ # check if short description is a string if type(shortDescription) != str: raise ValidationError("The short description must be a string.") @@ -50,7 +53,8 @@ def validate_abstract(abstractText): Raises: ______ - ValidationError if the abstract is too long or is not a string. + ValidationError + If the abstract is too long or is not a string. 
""" # check if short description is a string if type(abstractText) != str: raise ValidationError("The abstract must be a string.") @@ -59,6 +63,7 @@ def validate_abstract(abstractText): count = len(abstractText.split(" ")) if count > 200: raise ValidationError("The abstract must be less than or equal to 200 words.") + def validate_methods(methodText): """ Validates the methods of an experiment set, an experiment, or a scoreset. @@ -70,7 +75,8 @@ def validate_methods(methodText): Raises: ______ - ValidationError if the abstract is too long or is not a string. + ValidationError + If the methods are too long or is not a string. """ # check if short description is a string if type(methodText) != str: raise ValidationError("The methods must be a string.") @@ -79,6 +85,7 @@ def validate_methods(methodText): count = len(methodText.split(" ")) if count > 200: raise ValidationError("The methods must be less than or equal to 200 words.") + def validate_keywords(keywords): """ Validates the methods of an experiment set, an experiment, or a scoreset. @@ -90,7 +97,8 @@ def validate_keywords(keywords): Raises: ______ - ValidationError if the abstract is too long or is not a string. + ValidationError + If the keywords object is not a list of strings. 
""" "keywords": [ def validate_num_scoresets(numScoresets): From 1ad72948c0c2399331f75699d087293bd6127060 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:39:52 -0700 Subject: [PATCH 391/877] reformat spacing --- mavecore/validation_new/summary.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 57b7624..7c036c9 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -42,6 +42,7 @@ def validate_short_description(shortDescription): count = len(shortDescription.split(" ")) if count > 50: raise ValidationError("The short description must be less than or equal to 50 words.") + def validate_abstract(abstractText): """ Validates the abstract of an experiment set, an experiment, or a scoreset. From 91f8ca0203a0488423dbd77a2694082d6a1b19d2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:40:07 -0700 Subject: [PATCH 392/877] validate keyword object type --- mavecore/validation_new/summary.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index 7c036c9..e4e1c9f 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -101,5 +101,8 @@ def validate_keywords(keywords): ValidationError If the keywords object is not a list of strings. 
""" - "keywords": [ + # check keywords type + if type(keywords) != list[str]: raise ValidationError("The keywords must be a list of strings.") + + def validate_num_scoresets(numScoresets): From 6d9798cadab76f92379fcae2fd4ff4c6e6aa6f58 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 11:40:28 -0700 Subject: [PATCH 393/877] declare summary validation functions --- mavecore/validation_new/summary.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py index e4e1c9f..238cfa4 100644 --- a/mavecore/validation_new/summary.py +++ b/mavecore/validation_new/summary.py @@ -106,3 +106,12 @@ def validate_keywords(keywords): def validate_num_scoresets(numScoresets): + return + + +def validate_num_variants(numVariants): + return + + +def validate_dataset_columns(datasetColumns): + return From c7ec6ef4c2e93be44e5dce19756970a7331b58e7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:05:57 -0700 Subject: [PATCH 394/877] write custom object type validation with specific errors --- mavecore/validation_new/type.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 mavecore/validation_new/type.py diff --git a/mavecore/validation_new/type.py b/mavecore/validation_new/type.py new file mode 100644 index 0000000..ef696e7 --- /dev/null +++ b/mavecore/validation_new/type.py @@ -0,0 +1,17 @@ +from exceptions import ValidationError + + +def is_string(string): + if type(string) != string: raise ValidationError("{} must be a string.".format(string)) + + +def is_list(lst): + if type(lst) != lst: raise ValidationError("{} must be a list.".format(lst)) + + +def is_dictionary(dictionary): + if type(dictionary) != dictionary: raise ValidationError("{} must be a dictionary.".format(dictionary)) + + +def is_boolean(boolean): + if type(boolean) != boolean: raise ValidationError("{} must be a 
boolean value.".format(boolean)) \ No newline at end of file From 65be3cd2d49d55e925f6e581970897d9e6cfd211 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:06:26 -0700 Subject: [PATCH 395/877] write user validation and docstrings --- mavecore/validation_new/user.py | 95 ++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/user.py b/mavecore/validation_new/user.py index 27c25dd..deb70ec 100644 --- a/mavecore/validation_new/user.py +++ b/mavecore/validation_new/user.py @@ -1,4 +1,97 @@ +from exceptions import ValidationError +from type import * + + +def validate_user(userId): + """ + This function validates a user ID. + + Parameters: + __________ + id: dict + The user ID as a dictionary of user attributes + + Raises: + ______ + ValidationError + If any of the user attributes are found to be invalid. + """ + # check id type + is_dictionary(userId) + # run additional validation + validate_orcid_id(userId.get("orcid_id")) + validate_first_name(userId.get("first_name")) + validate_last_name(userId.get("lastName")) + validate_email(userId.get("email")) + + def validate_orcid_id(orcid_id): + """ + Validates ORCID ID. + + Parameters: + __________ + orcid_id: str + The user's ORCID ID. + + Raises: + ______ + ValidationError + If the user's ORCID ID is not valid. + """ + # check type + is_string(orcid_id) + + def validate_first_name(firstName): + """ + Validates user's first name. + + Parameters: + __________ + firstName: str + The user's first name. + + Raises: + ______ + ValidationError + If the user's first name is not a string. + """ + # check type + is_string(firstName) + + def validate_last_name(lastName): -def validate_email(email): \ No newline at end of file + """ + Validates user's last name. + + Parameters: + __________ + lastName: str + The user's last name. + + Raises: + ______ + ValidationError + If the user's last name is not a string. 
+ """ + # check type + is_string(lastName) + + +def validate_email(email): + """ + Validates user's email. + + Parameters: + __________ + email: str + The user's email. + + Raises: + ______ + ValidationError + If the user's email is not valid. + """ + # check type + is_string(email) From 47c04d9b31345e9613510afe30553d9f0a3fd010 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:06:53 -0700 Subject: [PATCH 396/877] define target gene validation functions and write docstrings --- mavecore/validation_new/target.py | 100 ++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py index 40d5a74..af54731 100644 --- a/mavecore/validation_new/target.py +++ b/mavecore/validation_new/target.py @@ -1,26 +1,74 @@ -"targetGene": { - "name": "string", - "category": "string", - "referenceMaps": [ - { - "id": 0, - "genomeId": 0, - "targetId": 0, - "isPrimary": true, - "genome": { - "shortName": "string", - "organismName": "string", - "genomeId": 0, - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02", - "id": 0 - }, - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02" - } - ], - "wtSequence": { - "sequenceType": "string", - "sequence": "string" - } - }, \ No newline at end of file +from exceptions import ValidationError +from type import * + + +def validate_target_gene(targetGene): + """ + Validates target gene represented as a dictionary. + + Parameters: + __________ + targetGene: dict + The target gene to be validated + + Raises: + ______ + ValidationError + If the target gene is not represented as a dictionary or if any of the key value pairs are invalid. 
+ """ + # "targetGene": { + # "name": "string", + # "category": "string", + # "referenceMaps": [ + # ], + # "wtSequence": { + # "sequenceType": "string", + # "sequence": "string" + # } + # }, + +def validate_name(name): + is_string(name) + + +def validate_category(category): + is_string(category) + + +def validate_reference_maps(referenceMaps): + """ + Validates reference maps for the target gene. + + Parameters: + __________ + referenceMaps: list[dict] + The list of reference maps to be validated + + Raises: + ______ + ValidationError + If the referenceMaps are not a list of dictionaries + or if any of the key value pairs in the dictionary are invalid + """ + is_list(referenceMaps) + +# { +# "id": 0, +# "genomeId": 0, +# "targetId": 0, +# "isPrimary": true, +# "genome": { +# "shortName": "string", +# "organismName": "string", +# "genomeId": 0, +# "creationDate": "2022-08-02", +# "modificationDate": "2022-08-02", +# "id": 0 +# }, +# "creationDate": "2022-08-02", +# "modificationDate": "2022-08-02" +# } + + + +def validate_wt_sequence(wtSequence): \ No newline at end of file From aaa9aae585111802ac568f90fd513d06fbfea14d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:07:01 -0700 Subject: [PATCH 397/877] edit imports --- mavecore/validation_new/scoreset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index fddd618..db1197a 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -1,5 +1,6 @@ from exceptions import ValidationError -import urn, summary, metadata, general, experiment +from type import * +import urn, summary, metadata, general, experiment, identifiers, user, target def validate_scoreset(scoreset, files): From 6d9721cbe231f56f112933af4a6e9f6012dc9210 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:07:11 
-0700 Subject: [PATCH 398/877] change type validation --- mavecore/validation_new/scoreset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index db1197a..0e187c3 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -20,7 +20,7 @@ def validate_scoreset(scoreset, files): If scoreset is not a dictionary or if any additional validation fails. """ # first validate that scoreset is a dictionary - if type(scoreset) != dict: raise ValidationError("The scoreset must be a dictionary.") + is_dictionary(scoreset) # { # "urn": "string", # "title": "string", From b913773e1a1ffd34b97102bef89c7eb508963473 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:07:28 -0700 Subject: [PATCH 399/877] validate experiment associated with scoreset --- mavecore/validation_new/scoreset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 0e187c3..c2a7f69 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -48,8 +48,7 @@ def validate_scoreset(scoreset, files): summary.validate_num_variants(scoreset.get("numVariants")) # "experiment": { # }, - experiment. 
- + experiment.validate_experiemnt(scoreset.get("experiment")) # "doiIdentifiers": [ # { # "identifier": "string", From 42a6e88c03be59d502effb64489a9550e41fca3e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:07:38 -0700 Subject: [PATCH 400/877] delete comments --- mavecore/validation_new/scoreset.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index c2a7f69..75779a0 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -51,17 +51,10 @@ def validate_scoreset(scoreset, files): experiment.validate_experiemnt(scoreset.get("experiment")) # "doiIdentifiers": [ # { - # "identifier": "string", - # "id": 0, - # "url": "string" # } # ], # "pubmedIdentifiers": [ # { - # "identifier": "string", - # "id": 0, - # "url": "string", - # "referenceHtml": "string" # } # ], # "publishedDate": "2022-08-02", From 3dcafeebae34381721bed2917ca21d80de231935 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:07:50 -0700 Subject: [PATCH 401/877] validate identifiers within scoreset --- mavecore/validation_new/scoreset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 75779a0..9cc6dcd 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -57,6 +57,8 @@ def validate_scoreset(scoreset, files): # { # } # ], + identifiers.validate_doi_identifiers(scoreset.get("doiIdentifiers")) + identifiers.validate_pubmed_identifiers(scoreset.get("pubmedIdentifiers")) # "publishedDate": "2022-08-02", # "creationDate": "2022-08-02", # "modificationDate": "2022-08-02", From 765898eecc92e55141ccbc0df4ae720e475c0aef Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:07:59 -0700 Subject: [PATCH 
402/877] validate dates within scoreset --- mavecore/validation_new/scoreset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 9cc6dcd..3c19247 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -62,6 +62,9 @@ def validate_scoreset(scoreset, files): # "publishedDate": "2022-08-02", # "creationDate": "2022-08-02", # "modificationDate": "2022-08-02", + general.validate_date(scoreset.get("publishedDate")) + general.validate_date(scoreset.get("creationDate")) + general.validate_date(scoreset.get("modificationDate")) # "createdBy": { # "orcid_id": "string", # "firstName": "string", From 20a50e61a3cbb330fa6b84d9b771356d7436b253 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:08:05 -0700 Subject: [PATCH 403/877] validate users within scoreset --- mavecore/validation_new/scoreset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 3c19247..c1b3c70 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -77,6 +77,8 @@ def validate_scoreset(scoreset, files): # "lastName": "string", # "email": "string" # }, + user.validate_user(scoreset.get("createdBy")) + user.validate_user(scoreset.get("modifiedBy")) # "targetGene": { # "name": "string", # "category": "string", From 6dc8cf7b514d0c0fce7012971eb1a05644ee5de5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:08:14 -0700 Subject: [PATCH 404/877] delete comments --- mavecore/validation_new/scoreset.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index c1b3c70..7b20fd3 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ 
-80,30 +80,6 @@ def validate_scoreset(scoreset, files): user.validate_user(scoreset.get("createdBy")) user.validate_user(scoreset.get("modifiedBy")) # "targetGene": { - # "name": "string", - # "category": "string", - # "referenceMaps": [ - # { - # "id": 0, - # "genomeId": 0, - # "targetId": 0, - # "isPrimary": true, - # "genome": { - # "shortName": "string", - # "organismName": "string", - # "genomeId": 0, - # "creationDate": "2022-08-02", - # "modificationDate": "2022-08-02", - # "id": 0 - # }, - # "creationDate": "2022-08-02", - # "modificationDate": "2022-08-02" - # } - # ], - # "wtSequence": { - # "sequenceType": "string", - # "sequence": "string" - # } # }, # "datasetColumns": {}, # "private": true From ce75e958ed0108db8486aedd122f2182927f9319 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:08:31 -0700 Subject: [PATCH 405/877] validate target gene associated with scoreset --- mavecore/validation_new/scoreset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index 7b20fd3..d767775 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -81,6 +81,7 @@ def validate_scoreset(scoreset, files): user.validate_user(scoreset.get("modifiedBy")) # "targetGene": { # }, + target.validate_target_gene(scoreset.get("targetGene")) # "datasetColumns": {}, # "private": true # } From b641e9e1a2365656f77cf1eb2d6ec3f60bcec337 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:08:52 -0700 Subject: [PATCH 406/877] validate summary and general scoreset information --- mavecore/validation_new/scoreset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py index d767775..be7584d 100644 --- a/mavecore/validation_new/scoreset.py +++ b/mavecore/validation_new/scoreset.py @@ -83,5 +83,7 @@ 
def validate_scoreset(scoreset, files): # }, target.validate_target_gene(scoreset.get("targetGene")) # "datasetColumns": {}, + summary.validate_dataset_columns(scoreset.get("datasetColumns")) # "private": true + general.validate_private(scoreset.get("private")) # } From 400009781a0ae16e6356002db720bd279f94fc6f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:09:13 -0700 Subject: [PATCH 407/877] define and write docstrings for identifiers --- mavecore/validation_new/identifiers.py | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mavecore/validation_new/identifiers.py b/mavecore/validation_new/identifiers.py index f0dba18..2c3f89f 100644 --- a/mavecore/validation_new/identifiers.py +++ b/mavecore/validation_new/identifiers.py @@ -1,3 +1,17 @@ +def validate_doi_identifiers(doiIdentifiers): + """ + This function validates a list of DOI identifiers represented as dictionaries. + + Parameters: + __________ + doiIdentifiers: list[dict] + The DOI identifiers that need to be validated + + Raises: + ______ + ValidationError + If any identifier is found to be invalid or if doiIdentifiers is not a list of dictionaries. + """ "doiIdentifiers": [ { "identifier": "string", @@ -5,6 +19,22 @@ "url": "string" } ], + + +def validate_pubmed_identifiers(pubmedIdentifiers): + """ + This function validates a list of PubMed identifiers represented as dictionaries. + + Parameters: + __________ + pubmedIdentifiers: list[dict] + The PubMed identifiers that need to be validated + + Raises: + ______ + ValidationError + If any identifier is found to be invalid or if pubMed Identifiers is not a list of dictionaries. 
+ """ "pubmedIdentifiers": [ { "identifier": "string", From 09ad4b3c612d70d32382cc60e262e4bff7b5f078 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:09:22 -0700 Subject: [PATCH 408/877] edit imports --- mavecore/validation_new/general.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py index fa30955..75073d9 100644 --- a/mavecore/validation_new/general.py +++ b/mavecore/validation_new/general.py @@ -1,3 +1,7 @@ +from exceptions import ValidationError +import datetime + + def validate_data_usage_policy(dataUsagePolicy): """ Validates From c653118ded3137636b924fc10089aabc05527b87 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:09:30 -0700 Subject: [PATCH 409/877] reformat --- mavecore/validation_new/general.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py index 75073d9..d9136a0 100644 --- a/mavecore/validation_new/general.py +++ b/mavecore/validation_new/general.py @@ -8,18 +8,24 @@ def validate_data_usage_policy(dataUsagePolicy): :param dataUsagePolicy: :return: """ + + def validate_license_id(licenseId): """ :param licenseId: :return: """ + + def validate_replaces_id(replacesId): """ :param replacesId: :return: """ + + def validate_processing_state(processingState): "creationDate": "2022-08-02", "modificationDate": "2022-08-02", From 486c11dea7fcb8c3d461f5d7617fb00fe059f7e5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:09:54 -0700 Subject: [PATCH 410/877] start docstring for processing state validation --- mavecore/validation_new/general.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py index d9136a0..d5bb172 100644 --- 
a/mavecore/validation_new/general.py +++ b/mavecore/validation_new/general.py @@ -27,9 +27,13 @@ def validate_replaces_id(replacesId): def validate_processing_state(processingState): - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02", - "publishedDate": "2022-08-02", + """ + + :param processingState: + :return: + """ + return + def validate_date(date): From bece28c4ed76072d2bbd13ced7f206c4fcd25142 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 13:10:41 -0700 Subject: [PATCH 411/877] write docstring for date validation --- mavecore/validation_new/general.py | 35 +++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py index d5bb172..becaeb7 100644 --- a/mavecore/validation_new/general.py +++ b/mavecore/validation_new/general.py @@ -36,5 +36,38 @@ def validate_processing_state(processingState): def validate_date(date): + """ + Validates a date such as creation date, modification date and published date. + + Parameters: + __________ + date: str + The date to be validated. + + Raises: + ______ + ValidationError + If the date is in the wrong format. + """ + template = '%Y-%m-%d' + try: + datetime.datetime.strptime(date, template) + except ValidationError: + print("Date should be formatted as YYYY-MM-DD") + -"private": true \ No newline at end of file +def validate_private(private): + """ + Validate private attribute. + + Parameters: + __________ + private: bool + The boolean private attribute to be validated. + + Raises: + ______ + ValidationError + If the private attribute is not a bool. 
+ """ + if type(private) != bool: raise ValidationError("The private attribute should be of type boolean.") \ No newline at end of file From a3295cda89fd393064539916eb941c6c0f43880e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:21:30 -0700 Subject: [PATCH 412/877] edit imports, fix typo in function signature --- mavecore/validation_new/experiment.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 891a782..7e459be 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -1,4 +1,8 @@ -def validate_experiemnt(experiment): +from type import * +import summary, metadata, urn, user, general, identifiers + + +def validate_experiment(experiment): """ This function validates an experiment. From 75e3a2c9c5b54fc0fba0e4dabfa3588825eaa111 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:21:42 -0700 Subject: [PATCH 413/877] validate type --- mavecore/validation_new/experiment.py | 49 ++------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 7e459be..f51453c 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -16,53 +16,8 @@ def validate_experiment(experiment): ValidationError If the experiment is not a dictionary or if any key:value pair in the experiment is not valid. 
""" -[ - { - "title": "string", - "shortDescription": "string", - "abstractText": "string", - "methodText": "string", - "extraMetadata": {}, - "keywords": [ - "string" - ], - "urn": "string", - "numScoresets": 0, - "createdBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "modifiedBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "creationDate": "2022-08-02", - "modificationDate": "2022-08-02", - "publishedDate": "2022-08-02", - "experimentSetUrn": "string", - "doiIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string" - } - ], - "pubmedIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } - ], - "processingState": "string" - } -] - + # check type + is_dictionary(experiment) # "experiment": { # "title": "string", # "shortDescription": "string", From 6923f075ba02118d42f21a0e65e3ea4a358ff3fa Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:02 -0700 Subject: [PATCH 414/877] validate experiment summary information --- mavecore/validation_new/experiment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index f51453c..8ec1056 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -27,8 +27,10 @@ def validate_experiment(experiment): # "keywords": [ # "string" # ], + summary.validate_keywords(experiment.get("keywords")) # "urn": "string", # "numScoresets": 0, + summary.validate_num_scoresets(experiment.get("numScoresets")) # "createdBy": { # "orcid_id": "string", # "firstName": "string", From 284258ab32a1213daf08c86e3c99b2ec20891e4f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:14 -0700 Subject: [PATCH 415/877] validate experiment summary 
information --- mavecore/validation_new/experiment.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 8ec1056..5cade50 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -23,6 +23,10 @@ def validate_experiment(experiment): # "shortDescription": "string", # "abstractText": "string", # "methodText": "string", + summary.validate_title(experiment.get("title")) + summary.validate_short_description(experiment.get("shortDescription")) + summary.validate_abstract(experiment.get("abstractText")) + summary.validate_methods(experiment.get("methodText")) # "extraMetadata": {}, # "keywords": [ # "string" From d4d5c4262db8e5704fb561e9ae11ac9daa92d0a2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:24 -0700 Subject: [PATCH 416/877] validate experiment metadata information --- mavecore/validation_new/experiment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 5cade50..21b7339 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -28,6 +28,7 @@ def validate_experiment(experiment): summary.validate_abstract(experiment.get("abstractText")) summary.validate_methods(experiment.get("methodText")) # "extraMetadata": {}, + metadata.validate_metadata(experiment.get("extraMetadata")) # "keywords": [ # "string" # ], From 441a90443ab5e2b0650228847ccbc1edd4a1134b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:30 -0700 Subject: [PATCH 417/877] validate experiment urn information --- mavecore/validation_new/experiment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 21b7339..5ae586c 100644 --- 
a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -34,6 +34,7 @@ def validate_experiment(experiment): # ], summary.validate_keywords(experiment.get("keywords")) # "urn": "string", + urn.validate_experiment_urn(experiment.get("urn")) # "numScoresets": 0, summary.validate_num_scoresets(experiment.get("numScoresets")) # "createdBy": { From 3aa2d929cd6d17b5d7ca783329deed20f962430c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:37 -0700 Subject: [PATCH 418/877] validate experiment urn information --- mavecore/validation_new/experiment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 5ae586c..c32db72 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -53,6 +53,7 @@ def validate_experiment(experiment): # "modificationDate": "2022-08-02", # "publishedDate": "2022-08-02", # "experimentSetUrn": "string", + urn.validate_experimentset_urn(experiment.get("experimentSetUrn")) # "doiIdentifiers": [ # { # "identifier": "string", From 2c0662ab8e98986b1721f72801a8689da3eb9cc2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:43 -0700 Subject: [PATCH 419/877] validate experiment user information --- mavecore/validation_new/experiment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index c32db72..0a192ef 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -49,6 +49,8 @@ def validate_experiment(experiment): # "lastName": "string", # "email": "string" # }, + user.validate_user(experiment.get("createdBy")) + user.validate_user(experiment.get("modifiedBy")) # "creationDate": "2022-08-02", # "modificationDate": "2022-08-02", # "publishedDate": "2022-08-02", From 
03e4a6a63a55b8f99cc259f4de4a86e08d4addc1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:50 -0700 Subject: [PATCH 420/877] validate experiment general information --- mavecore/validation_new/experiment.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 0a192ef..1d3e1d5 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -54,6 +54,9 @@ def validate_experiment(experiment): # "creationDate": "2022-08-02", # "modificationDate": "2022-08-02", # "publishedDate": "2022-08-02", + general.validate_date(experiment.get("creationDate")) + general.validate_date(experiment.get("modificationDate")) + general.validate_date(experiment.get("publishedDate")) # "experimentSetUrn": "string", urn.validate_experimentset_urn(experiment.get("experimentSetUrn")) # "doiIdentifiers": [ From 9d9fa97f3a14a22eb0633f48b56e5fc875d8540c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:22:54 -0700 Subject: [PATCH 421/877] validate experiment general information --- mavecore/validation_new/experiment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index 1d3e1d5..f3f6976 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -75,4 +75,5 @@ def validate_experiment(experiment): # } # ], # "processingState": "string" + general.validate_processing_state(experiment.get("processingState")) # }, \ No newline at end of file From 820972f826fd33598646692ea0f57cbb993077e6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:23:04 -0700 Subject: [PATCH 422/877] validate experiment identifier information --- mavecore/validation_new/experiment.py | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py index f3f6976..06ca2ca 100644 --- a/mavecore/validation_new/experiment.py +++ b/mavecore/validation_new/experiment.py @@ -74,6 +74,8 @@ def validate_experiment(experiment): # "referenceHtml": "string" # } # ], + identifiers.validate_doi_identifiers(experiment.get("doiIdentifiers")) + identifiers.validate_pubmed_identifiers(experiment.get("pubmedIdentifiers")) # "processingState": "string" general.validate_processing_state(experiment.get("processingState")) # }, \ No newline at end of file From 1de2cab76b791dc41b4fa64f341afdc3722c27e1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:23:12 -0700 Subject: [PATCH 423/877] validate type --- mavecore/validation_new/target.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py index af54731..d494b46 100644 --- a/mavecore/validation_new/target.py +++ b/mavecore/validation_new/target.py @@ -16,6 +16,7 @@ def validate_target_gene(targetGene): ValidationError If the target gene is not represented as a dictionary or if any of the key value pairs are invalid. 
""" + is_dictionary(targetGene) # "targetGene": { # "name": "string", # "category": "string", From d141b231addb6f986e15a9975dd2b7ffe9d9d145 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:23:19 -0700 Subject: [PATCH 424/877] validate type --- mavecore/validation_new/target.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py index d494b46..a5da77e 100644 --- a/mavecore/validation_new/target.py +++ b/mavecore/validation_new/target.py @@ -52,6 +52,8 @@ def validate_reference_maps(referenceMaps): or if any of the key value pairs in the dictionary are invalid """ is_list(referenceMaps) + is_integer(referenceMaps.get("id")) + is_integer(referenceMaps.get("genomeId")) # { # "id": 0, From a4a677c6e33d46365ae0c47aa413cd5ffa3d19c4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:23:47 -0700 Subject: [PATCH 425/877] add custom type validation --- mavecore/validation_new/type.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mavecore/validation_new/type.py b/mavecore/validation_new/type.py index ef696e7..7e0f56b 100644 --- a/mavecore/validation_new/type.py +++ b/mavecore/validation_new/type.py @@ -1,16 +1,26 @@ from exceptions import ValidationError -def is_string(string): - if type(string) != string: raise ValidationError("{} must be a string.".format(string)) +def is_none(item): + if item is None: raise ValidationError("{} is a required attribute.".format(item)) +def is_integer(item): + if type(item) != int: raise ValidationError("{} must be a string.".format(item)) + + +def is_string(item): + if type(item) != item: raise ValidationError("{} must be a string.".format(item)) + def is_list(lst): if type(lst) != lst: raise ValidationError("{} must be a list.".format(lst)) +def is_list(item): + if type(item) != item: raise ValidationError("{} must be a 
list.".format(item)) + -def is_dictionary(dictionary): - if type(dictionary) != dictionary: raise ValidationError("{} must be a dictionary.".format(dictionary)) +def is_dictionary(item): + if type(item) != item: raise ValidationError("{} must be a dictionary.".format(item)) def is_boolean(boolean): From 212054363918032062f5d17b18a638bf6075047d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:23:55 -0700 Subject: [PATCH 426/877] reformat --- mavecore/validation_new/type.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/validation_new/type.py b/mavecore/validation_new/type.py index 7e0f56b..2ad6d15 100644 --- a/mavecore/validation_new/type.py +++ b/mavecore/validation_new/type.py @@ -12,8 +12,6 @@ def is_integer(item): def is_string(item): if type(item) != item: raise ValidationError("{} must be a string.".format(item)) -def is_list(lst): - if type(lst) != lst: raise ValidationError("{} must be a list.".format(lst)) def is_list(item): if type(item) != item: raise ValidationError("{} must be a list.".format(item)) From 078f457462a11932a3dc077324c3fd2f030be6ce Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:39:46 -0700 Subject: [PATCH 427/877] note identifier attributes --- mavecore/validation_new/identifiers.py | 30 ++++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mavecore/validation_new/identifiers.py b/mavecore/validation_new/identifiers.py index 2c3f89f..3173c49 100644 --- a/mavecore/validation_new/identifiers.py +++ b/mavecore/validation_new/identifiers.py @@ -12,13 +12,14 @@ def validate_doi_identifiers(doiIdentifiers): ValidationError If any identifier is found to be invalid or if doiIdentifiers is not a list of dictionaries. 
""" -"doiIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string" - } - ], + return +#"doiIdentifiers": [ + # { + # "identifier": "string", + # "id": 0, + # "url": "string" + # } + # ], def validate_pubmed_identifiers(pubmedIdentifiers): @@ -35,10 +36,11 @@ def validate_pubmed_identifiers(pubmedIdentifiers): ValidationError If any identifier is found to be invalid or if pubMed Identifiers is not a list of dictionaries. """ - "pubmedIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } \ No newline at end of file + #"pubmedIdentifiers": [ + # { + # "identifier": "string", + # "id": 0, + # "url": "string", + # "referenceHtml": "string" + #} + return \ No newline at end of file From e95abd22ef00dbe3c60603eb42c271bc0ef82cdd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:39:53 -0700 Subject: [PATCH 428/877] reformat --- mavecore/validation_new/target.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py index a5da77e..0a44a6c 100644 --- a/mavecore/validation_new/target.py +++ b/mavecore/validation_new/target.py @@ -28,6 +28,7 @@ def validate_target_gene(targetGene): # } # }, + def validate_name(name): is_string(name) From 00e62f0583c236dd57ec8adb2f231bbe6be23032 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:39:57 -0700 Subject: [PATCH 429/877] reformat --- mavecore/validation_new/target.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py index 0a44a6c..7fe6e59 100644 --- a/mavecore/validation_new/target.py +++ b/mavecore/validation_new/target.py @@ -74,5 +74,4 @@ def validate_reference_maps(referenceMaps): # } - def validate_wt_sequence(wtSequence): \ No newline at end of file From 35fc484b6e1f45697d1428ff633070506d06125c Mon Sep 17 00:00:00 
2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 3 Aug 2022 14:40:10 -0700 Subject: [PATCH 430/877] add return statement --- mavecore/validation_new/target.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py index 7fe6e59..5ee0184 100644 --- a/mavecore/validation_new/target.py +++ b/mavecore/validation_new/target.py @@ -74,4 +74,5 @@ def validate_reference_maps(referenceMaps): # } -def validate_wt_sequence(wtSequence): \ No newline at end of file +def validate_wt_sequence(wtSequence): + return \ No newline at end of file From d7fd15af424d9a2099f562a9f98c798a9dd166de Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:06:58 -0700 Subject: [PATCH 431/877] pydantic implementation --- mavecore/models/__init__.py | 0 mavecore/models/dataset.py | 5 ++ mavecore/models/experiment.py | 5 ++ mavecore/models/experimentset.py | 3 + mavecore/models/identifier.py | 0 mavecore/models/scoreset.py | 117 +++++++++++++++++++++++++++++++ mavecore/models/target.py | 0 7 files changed, 130 insertions(+) create mode 100644 mavecore/models/__init__.py create mode 100644 mavecore/models/dataset.py create mode 100644 mavecore/models/experiment.py create mode 100644 mavecore/models/experimentset.py create mode 100644 mavecore/models/identifier.py create mode 100644 mavecore/models/scoreset.py create mode 100644 mavecore/models/target.py diff --git a/mavecore/models/__init__.py b/mavecore/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/models/dataset.py b/mavecore/models/dataset.py new file mode 100644 index 0000000..b069efc --- /dev/null +++ b/mavecore/models/dataset.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + +class DataSet(BaseModel): + dataset = BaseModel + scoreset = BaseModel \ No newline at end of file diff --git a/mavecore/models/experiment.py 
b/mavecore/models/experiment.py new file mode 100644 index 0000000..8c65251 --- /dev/null +++ b/mavecore/models/experiment.py @@ -0,0 +1,5 @@ +from dataset import DataSet + +class Experiment(DataSet): + + diff --git a/mavecore/models/experimentset.py b/mavecore/models/experimentset.py new file mode 100644 index 0000000..d70e342 --- /dev/null +++ b/mavecore/models/experimentset.py @@ -0,0 +1,3 @@ +from dataset import DataSet + +class ExperimentSet(DataSet): \ No newline at end of file diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py new file mode 100644 index 0000000..e69de29 diff --git a/mavecore/models/scoreset.py b/mavecore/models/scoreset.py new file mode 100644 index 0000000..6fcc809 --- /dev/null +++ b/mavecore/models/scoreset.py @@ -0,0 +1,117 @@ +from dataset import DataSet + +class ScoreSet(DataSet): + urn: str + title: str + methodText: str + abstractText: str + shortDescription: str + extraMetadata: dict + dataUsagePolicy: str + licenceId: int + replacesId: int + keywords: list[str] + numVariants: int + "experiment": { + "title": "string", + "shortDescription": "string", + "abstractText": "string", + "methodText": "string", + "extraMetadata": {}, + "keywords": [ + "string" + ], + "urn": "string", + "numScoresets": 0, + "createdBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "modifiedBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "creationDate": "2022-08-10", + "modificationDate": "2022-08-10", + "publishedDate": "2022-08-10", + "experimentSetUrn": "string", + "doiIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string" + } + ], + "pubmedIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string", + "referenceHtml": "string" + } + ], + "processingState": "string" + }, + "doiIdentifiers": [ + { + "identifier": "string", + "id": 0, + "url": "string" + } + ], + "pubmedIdentifiers": [ 
+ { + "identifier": "string", + "id": 0, + "url": "string", + "referenceHtml": "string" + } + ], + "publishedDate": "2022-08-10", + "creationDate": "2022-08-10", + "modificationDate": "2022-08-10", + "createdBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "modifiedBy": { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, + "targetGene": { + "name": "string", + "category": "string", + "referenceMaps": [ + { + "id": 0, + "genomeId": 0, + "targetId": 0, + "isPrimary": true, + "genome": { + "shortName": "string", + "organismName": "string", + "genomeId": 0, + "creationDate": "2022-08-10", + "modificationDate": "2022-08-10", + "id": 0 + }, + "creationDate": "2022-08-10", + "modificationDate": "2022-08-10" + } + ], + "wtSequence": { + "sequenceType": "string", + "sequence": "string" + } + }, + "datasetColumns": {}, + "private": true +} \ No newline at end of file diff --git a/mavecore/models/target.py b/mavecore/models/target.py new file mode 100644 index 0000000..e69de29 From 778b08004e30f6b04d7bfe102384668cee7a6e51 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:11:44 -0700 Subject: [PATCH 432/877] pydantic implementation --- mavecore/models/user.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mavecore/models/user.py diff --git a/mavecore/models/user.py b/mavecore/models/user.py new file mode 100644 index 0000000..e69de29 From 3ab7c2cfc6e2548b410fcb5c2da4c96ad881c69d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:13:32 -0700 Subject: [PATCH 433/877] rename and add additional models --- mavecore/models/data.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 mavecore/models/data.py diff --git a/mavecore/models/data.py b/mavecore/models/data.py new file mode 100644 index 0000000..aa2b39d --- /dev/null +++ 
b/mavecore/models/data.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +class DataSet(BaseModel): + +class ExperimentSet(DataSet): + +class Experiment(DataSet): + +class ScoreSet(DataSet): \ No newline at end of file From 64f2ded4e907a50572a393a050aa3a29308b5925 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:13:38 -0700 Subject: [PATCH 434/877] rename --- mavecore/models/dataset.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 mavecore/models/dataset.py diff --git a/mavecore/models/dataset.py b/mavecore/models/dataset.py deleted file mode 100644 index b069efc..0000000 --- a/mavecore/models/dataset.py +++ /dev/null @@ -1,5 +0,0 @@ -from pydantic import BaseModel - -class DataSet(BaseModel): - dataset = BaseModel - scoreset = BaseModel \ No newline at end of file From f1018cfde097f7b61cbbb06952e08799f5df335c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:37:42 -0700 Subject: [PATCH 435/877] import datetime and datamodels --- mavecore/models/data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index aa2b39d..9013fe5 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -1,4 +1,11 @@ from pydantic import BaseModel +from datetime import datetime + +from user import User +from urn import Urn +from identifier import * +from target import Target + class DataSet(BaseModel): From 6472b14380e0f95b902283d07b24542a033541c5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:38:02 -0700 Subject: [PATCH 436/877] set dataset data model --- mavecore/models/data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 9013fe5..5de127b 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -8,6 +8,17 @@ class DataSet(BaseModel): + 
urn: str + title: str + shortDescription: str + abstractText: str + methodText: str + extraMetadata: dict + creationDate: datetime + publishedDate: datetime + modificationDate: datetime + createdBy: User + modifiedBy: User class ExperimentSet(DataSet): From 84e6d7ed43b9accd40acf9643ea4fba19f3784bd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:38:21 -0700 Subject: [PATCH 437/877] set experiment and experimentset data model --- mavecore/models/data.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 5de127b..210205c 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -23,5 +23,18 @@ class DataSet(BaseModel): class ExperimentSet(DataSet): class Experiment(DataSet): + keywords: list[str] + numScoresets: int + experimentSetUrn: Urn + doiIdentifiers: DoiIdentifier + pubmedIdentifiers: PubmedIdentifier + processingState: str + + +class ExperimentSet(DataSet): + id: int + experiments: list[Experiment] + numExperiments: int + class ScoreSet(DataSet): \ No newline at end of file From fb196e050bfe893fd081c54e670fd06df42b687e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:38:32 -0700 Subject: [PATCH 438/877] set scoreset data model --- mavecore/models/data.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 210205c..69da298 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -37,4 +37,15 @@ class ExperimentSet(DataSet): numExperiments: int -class ScoreSet(DataSet): \ No newline at end of file +class ScoreSet(DataSet): + dataUsagePolicy: str + licenceId: int + replacesId: int + keywords: list[str] + numVariants: int + experiment: Experiment + doiIdentifiers: DoiIdentifier + pubmedIdentifiers: PubmedIdentifier + targetGene: Target + datasetColumns: dict + private: 
bool From bc636ec2d2827c8607891d32c4803452046bde02 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:38:45 -0700 Subject: [PATCH 439/877] reformat --- mavecore/models/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 69da298..d38eaff 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -20,7 +20,6 @@ class DataSet(BaseModel): createdBy: User modifiedBy: User -class ExperimentSet(DataSet): class Experiment(DataSet): keywords: list[str] From 4c80fabee92160c3bf0dd26019bd9f9b0d0aa8d9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:39:09 -0700 Subject: [PATCH 440/877] declare identifier data models --- mavecore/models/identifier.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index e69de29..9563667 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +class Identifier(BaseModel): + +class DoiIdentifier(Identifier): + { + "identifier": "string", + "id": 0, + "url": "string" + } + +class PubmedIdentifier(Identifier): + { + "identifier": "string", + "id": 0, + "url": "string", + "referenceHtml": "string" + } \ No newline at end of file From afa7030508c85b45d69e5cb6df38709ffe2557e4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:39:21 -0700 Subject: [PATCH 441/877] delete --- mavecore/models/scoreset.py | 114 ------------------------------------ 1 file changed, 114 deletions(-) diff --git a/mavecore/models/scoreset.py b/mavecore/models/scoreset.py index 6fcc809..6bd66b6 100644 --- a/mavecore/models/scoreset.py +++ b/mavecore/models/scoreset.py @@ -1,117 +1,3 @@ from dataset import DataSet class ScoreSet(DataSet): - urn: str - title: str - methodText: str - 
abstractText: str - shortDescription: str - extraMetadata: dict - dataUsagePolicy: str - licenceId: int - replacesId: int - keywords: list[str] - numVariants: int - "experiment": { - "title": "string", - "shortDescription": "string", - "abstractText": "string", - "methodText": "string", - "extraMetadata": {}, - "keywords": [ - "string" - ], - "urn": "string", - "numScoresets": 0, - "createdBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "modifiedBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "creationDate": "2022-08-10", - "modificationDate": "2022-08-10", - "publishedDate": "2022-08-10", - "experimentSetUrn": "string", - "doiIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string" - } - ], - "pubmedIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } - ], - "processingState": "string" - }, - "doiIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string" - } - ], - "pubmedIdentifiers": [ - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } - ], - "publishedDate": "2022-08-10", - "creationDate": "2022-08-10", - "modificationDate": "2022-08-10", - "createdBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "modifiedBy": { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, - "targetGene": { - "name": "string", - "category": "string", - "referenceMaps": [ - { - "id": 0, - "genomeId": 0, - "targetId": 0, - "isPrimary": true, - "genome": { - "shortName": "string", - "organismName": "string", - "genomeId": 0, - "creationDate": "2022-08-10", - "modificationDate": "2022-08-10", - "id": 0 - }, - "creationDate": "2022-08-10", - "modificationDate": "2022-08-10" - } - ], - "wtSequence": { - "sequenceType": "string", - "sequence": 
"string" - } - }, - "datasetColumns": {}, - "private": true -} \ No newline at end of file From c8b041c47154edb8f944c7697d24371afc3731c7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:39:44 -0700 Subject: [PATCH 442/877] declare urn data model --- mavecore/models/urn.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 mavecore/models/urn.py diff --git a/mavecore/models/urn.py b/mavecore/models/urn.py new file mode 100644 index 0000000..2b3a1f6 --- /dev/null +++ b/mavecore/models/urn.py @@ -0,0 +1,3 @@ +from pydantic import BaseModel + +class Urn(BaseModel): \ No newline at end of file From e90db1577bb8ebc182132d232bcd31ca47b646e7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 16:39:52 -0700 Subject: [PATCH 443/877] declare user data model --- mavecore/models/user.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index e69de29..29e3436 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +class User(BaseModel): + { + "orcid_id": "string", + "firstName": "string", + "lastName": "string", + "email": "string" + }, \ No newline at end of file From 6efd9eb45c71c2264f01286597ee9e72022f3657 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:05:49 -0700 Subject: [PATCH 444/877] edit imports --- mavecore/models/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index d38eaff..79cbf77 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -4,7 +4,8 @@ from user import User from urn import Urn from identifier import * -from target import Target +from target import TargetGene +from urn import Urn class DataSet(BaseModel): From a90da97a6f91473200d9d9f0f578ddbb2a00698a Mon Sep 17 
00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:05:59 -0700 Subject: [PATCH 445/877] edit type --- mavecore/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 79cbf77..f276d15 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -9,7 +9,7 @@ class DataSet(BaseModel): - urn: str + urn: Urn title: str shortDescription: str abstractText: str From 29834f651070deffbf31f4b3acbe2b77b4a0e61b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:07:23 -0700 Subject: [PATCH 446/877] change model name --- mavecore/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index f276d15..da68ae4 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -46,6 +46,6 @@ class ScoreSet(DataSet): experiment: Experiment doiIdentifiers: DoiIdentifier pubmedIdentifiers: PubmedIdentifier - targetGene: Target + targetGene: TargetGene datasetColumns: dict private: bool From 6fc6776e32804e3ab18877ae4d5dd2bfb7cfee85 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:07:37 -0700 Subject: [PATCH 447/877] delete redundant models --- mavecore/models/experiment.py | 5 ----- mavecore/models/experimentset.py | 3 --- mavecore/models/scoreset.py | 3 --- 3 files changed, 11 deletions(-) delete mode 100644 mavecore/models/experiment.py delete mode 100644 mavecore/models/experimentset.py delete mode 100644 mavecore/models/scoreset.py diff --git a/mavecore/models/experiment.py b/mavecore/models/experiment.py deleted file mode 100644 index 8c65251..0000000 --- a/mavecore/models/experiment.py +++ /dev/null @@ -1,5 +0,0 @@ -from dataset import DataSet - -class Experiment(DataSet): - - diff --git a/mavecore/models/experimentset.py 
b/mavecore/models/experimentset.py deleted file mode 100644 index d70e342..0000000 --- a/mavecore/models/experimentset.py +++ /dev/null @@ -1,3 +0,0 @@ -from dataset import DataSet - -class ExperimentSet(DataSet): \ No newline at end of file diff --git a/mavecore/models/scoreset.py b/mavecore/models/scoreset.py deleted file mode 100644 index 6bd66b6..0000000 --- a/mavecore/models/scoreset.py +++ /dev/null @@ -1,3 +0,0 @@ -from dataset import DataSet - -class ScoreSet(DataSet): From 9a957aa76c19f0746c3a2268c3424bdf15972c6c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:07:50 -0700 Subject: [PATCH 448/877] edit imports --- mavecore/models/urn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mavecore/models/urn.py b/mavecore/models/urn.py index 2b3a1f6..e24a5c4 100644 --- a/mavecore/models/urn.py +++ b/mavecore/models/urn.py @@ -1,3 +1,6 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator + +from ..validation_new.constants.urn import * + class Urn(BaseModel): \ No newline at end of file From fd5aba88c7dd6799e71b27fc48fe1cac52c4b817 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:08:04 -0700 Subject: [PATCH 449/877] add validation to urn model --- mavecore/models/urn.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mavecore/models/urn.py b/mavecore/models/urn.py index e24a5c4..66a0794 100644 --- a/mavecore/models/urn.py +++ b/mavecore/models/urn.py @@ -3,4 +3,10 @@ from ..validation_new.constants.urn import * -class Urn(BaseModel): \ No newline at end of file +class Urn(BaseModel): + urn: str + + @validator('urn') + def must_match_regular_expression(cls, v): + if not MAVEDB_ANY_URN_RE.match(v): + raise ValueError("{}'s is not a valid urn.".format(v)) From c402a16cd6476467d1719fde1483e61f4a4ef13c Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:08:19 -0700 Subject: [PATCH 450/877] declare and outline target gene model --- mavecore/models/target.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index e69de29..39afb9c 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from map import ReferenceMap +from sequence import WildType + + +class TargetGene(BaseModel): + name: str + category: str + referenceMaps: list[ReferenceMap] + wtSequence: WildType From 5bbd11e4c61d58b13128013e81589f125c45903a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:08:32 -0700 Subject: [PATCH 451/877] declare and outline wild type sequence model --- mavecore/models/sequence.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 mavecore/models/sequence.py diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py new file mode 100644 index 0000000..87c31e8 --- /dev/null +++ b/mavecore/models/sequence.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class WildType(BaseModel): + sequenceType: str + sequence: str From c8d3f0f6532f666395d9fb2649844a03e0abd9fd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:08:44 -0700 Subject: [PATCH 452/877] declare and outline reference map model --- mavecore/models/map.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 mavecore/models/map.py diff --git a/mavecore/models/map.py b/mavecore/models/map.py new file mode 100644 index 0000000..e6f5040 --- /dev/null +++ b/mavecore/models/map.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel +from datetime import datetime + +from genome import Genome + + +class ReferenceMap(BaseModel): + id: int + genomeId: int + targetId: int + isPrimary: bool + genome: Genome + creationDate: 
datetime + modificationDate: datetime \ No newline at end of file From 999fc5c509140b50c8d184541383f1cf00229aa0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:08:53 -0700 Subject: [PATCH 453/877] edit imports --- mavecore/models/identifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 9563667..448832b 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -1,4 +1,5 @@ -from pydantic import BaseModel +from pydantic import BaseModel, HttpUrl + class Identifier(BaseModel): From df790d839c04c00e3a21154c5e175ac2cb7665c7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:09:14 -0700 Subject: [PATCH 454/877] add attributes to identifier model --- mavecore/models/identifier.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 448832b..a162da0 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -2,6 +2,10 @@ class Identifier(BaseModel): + identifier: str + id: 0 + url: HttpUrl + class DoiIdentifier(Identifier): { From dd37a3b2eea0e7a9f0e45d8017be8f730b5fc71a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:09:28 -0700 Subject: [PATCH 455/877] pass on doi identifier model --- mavecore/models/identifier.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index a162da0..b93f72f 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -8,11 +8,8 @@ class Identifier(BaseModel): class DoiIdentifier(Identifier): - { - "identifier": "string", - "id": 0, - "url": "string" - } + pass + class PubmedIdentifier(Identifier): { From 974c0bfe8f88e7d58665d11e90185f5f686e6a92 Mon Sep 17 
00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:09:46 -0700 Subject: [PATCH 456/877] add fields to pubmed identifier model --- mavecore/models/identifier.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index b93f72f..94af068 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -12,9 +12,4 @@ class DoiIdentifier(Identifier): class PubmedIdentifier(Identifier): - { - "identifier": "string", - "id": 0, - "url": "string", - "referenceHtml": "string" - } \ No newline at end of file + referenceHtml: str From 495778933903078d69d2098f869cd12c377daf88 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 10 Aug 2022 17:10:03 -0700 Subject: [PATCH 457/877] declare and validate genome model --- mavecore/models/genome.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 mavecore/models/genome.py diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py new file mode 100644 index 0000000..04efd69 --- /dev/null +++ b/mavecore/models/genome.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel +from datetime import datetime + + +class Genome(BaseModel): + shortName: str + organismName: str + genomeId: int + creationDate: datetime + modificationDate: datetime + id: int From 177ac97336328143b127b984e1de42fa3c205259 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:54:17 -0700 Subject: [PATCH 458/877] import typing --- mavecore/models/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index da68ae4..631b4cf 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -1,5 +1,6 @@ from pydantic import BaseModel from datetime import datetime +from typing import List, Dict, Optional from user import User from urn import Urn From 
094330f5c14884a61c294fe46d4247ae11ec1934 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:54:28 -0700 Subject: [PATCH 459/877] import identifier models --- mavecore/models/data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 631b4cf..60b5c3d 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -3,8 +3,7 @@ from typing import List, Dict, Optional from user import User -from urn import Urn -from identifier import * +from identifier import DoiIdentifier, PubmedIdentifier from target import TargetGene from urn import Urn From f5b5ed3b78d74d8fb06912fcafd75b25a0ebc890 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:54:35 -0700 Subject: [PATCH 460/877] import urn models --- mavecore/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 60b5c3d..25bf785 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -5,7 +5,7 @@ from user import User from identifier import DoiIdentifier, PubmedIdentifier from target import TargetGene -from urn import Urn +from urn import ExperimentUrn, ExperimentSetUrn, ScoreSetUrn class DataSet(BaseModel): From bcc553c1fdf554f36203fecbb4bc983a6f194344 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:54:45 -0700 Subject: [PATCH 461/877] delete urn attribute --- mavecore/models/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 25bf785..1f84037 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -9,7 +9,6 @@ class DataSet(BaseModel): - urn: Urn title: str shortDescription: str abstractText: str From 19d5b24d9886d346e52e019825e46df2d890877b Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:55:18 -0700 Subject: [PATCH 462/877] add specific urn attributes and change keyword type --- mavecore/models/data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 1f84037..f3d2820 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -22,7 +22,8 @@ class DataSet(BaseModel): class Experiment(DataSet): - keywords: list[str] + urn: ExperimentUrn + keywords: List[str] numScoresets: int experimentSetUrn: Urn doiIdentifiers: DoiIdentifier @@ -31,12 +32,14 @@ class Experiment(DataSet): class ExperimentSet(DataSet): + urn: ExperimentSetUrn id: int experiments: list[Experiment] numExperiments: int class ScoreSet(DataSet): + urn: ScoreSetUrn dataUsagePolicy: str licenceId: int replacesId: int From 3a900e633c5f1e52ebc3be164c4d52f92bc6da4c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:55:42 -0700 Subject: [PATCH 463/877] mark attributes as optional --- mavecore/models/data.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index f3d2820..7f21275 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -13,21 +13,21 @@ class DataSet(BaseModel): shortDescription: str abstractText: str methodText: str - extraMetadata: dict + extraMetadata: Optional[Dict] creationDate: datetime - publishedDate: datetime - modificationDate: datetime - createdBy: User - modifiedBy: User + publishedDate: Optional[datetime] + modificationDate: Optional[datetime] + createdBy: Optional[User] + modifiedBy: Optional[User] class Experiment(DataSet): urn: ExperimentUrn keywords: List[str] numScoresets: int - experimentSetUrn: Urn - doiIdentifiers: DoiIdentifier - pubmedIdentifiers: PubmedIdentifier + experimentSetUrn: ExperimentSetUrn + doiIdentifiers: Optional[DoiIdentifier] 
+ pubmedIdentifiers: Optional[PubmedIdentifier] processingState: str @@ -42,12 +42,12 @@ class ScoreSet(DataSet): urn: ScoreSetUrn dataUsagePolicy: str licenceId: int - replacesId: int - keywords: list[str] + replacesId: Optional[int] + keywords: Optional[List[str]] numVariants: int experiment: Experiment - doiIdentifiers: DoiIdentifier - pubmedIdentifiers: PubmedIdentifier + doiIdentifiers: Optional[DoiIdentifier] + pubmedIdentifiers: Optional[PubmedIdentifier] targetGene: TargetGene datasetColumns: dict private: bool From d1642b3b8de0812a7e8b5f55a1cd28c2d305bdb0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:55:54 -0700 Subject: [PATCH 464/877] change attribute types --- mavecore/models/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 7f21275..2a1599e 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -34,7 +34,7 @@ class Experiment(DataSet): class ExperimentSet(DataSet): urn: ExperimentSetUrn id: int - experiments: list[Experiment] + experiments: List[Experiment] numExperiments: int @@ -49,5 +49,5 @@ class ScoreSet(DataSet): doiIdentifiers: Optional[DoiIdentifier] pubmedIdentifiers: Optional[PubmedIdentifier] targetGene: TargetGene - datasetColumns: dict + datasetColumns: Dict private: bool From d73f92d80382655e1dfef4219ebc617f99491ae3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:56:02 -0700 Subject: [PATCH 465/877] import optional --- mavecore/models/genome.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py index 04efd69..78c8826 100644 --- a/mavecore/models/genome.py +++ b/mavecore/models/genome.py @@ -1,5 +1,6 @@ from pydantic import BaseModel from datetime import datetime +from typing import Optional class Genome(BaseModel): From c8bcddcd1a1f2274600059d7b73c37c9cedf877d Mon Sep 
17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:56:17 -0700 Subject: [PATCH 466/877] make attribute optional --- mavecore/models/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py index 78c8826..3130833 100644 --- a/mavecore/models/genome.py +++ b/mavecore/models/genome.py @@ -8,5 +8,5 @@ class Genome(BaseModel): organismName: str genomeId: int creationDate: datetime - modificationDate: datetime + modificationDate: Optional[datetime] id: int From ad187818d51dc9bed900dcda9e1a9d70eef11501 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:56:34 -0700 Subject: [PATCH 467/877] import optional --- mavecore/models/map.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index e6f5040..2e14ccd 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -1,5 +1,6 @@ from pydantic import BaseModel from datetime import datetime +from typing import Optional from genome import Genome From 3d40d5359b257c63f3bd4960a1b1ca056df1dff4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:56:44 -0700 Subject: [PATCH 468/877] make attribute optional --- mavecore/models/map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index 2e14ccd..2d8f942 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -12,4 +12,4 @@ class ReferenceMap(BaseModel): isPrimary: bool genome: Genome creationDate: datetime - modificationDate: datetime \ No newline at end of file + modificationDate: Optional[datetime] \ No newline at end of file From f774c4aa868e0f749fd292ba8b1e3ab222e0aeb6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:56:55 -0700 Subject: [PATCH 469/877] 
import list --- mavecore/models/target.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 39afb9c..e500f90 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -1,4 +1,5 @@ from pydantic import BaseModel +from typing import List from map import ReferenceMap from sequence import WildType From 2666e0d0b69761c8533c4c24c967b76371345145 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:57:16 -0700 Subject: [PATCH 470/877] change attribute type --- mavecore/models/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index e500f90..5ef255e 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -8,5 +8,5 @@ class TargetGene(BaseModel): name: str category: str - referenceMaps: list[ReferenceMap] + referenceMaps: List[ReferenceMap] wtSequence: WildType From da6bf8fa4ac2a886c1898c336e5a6eacfe2260e0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:57:28 -0700 Subject: [PATCH 471/877] import validation error --- mavecore/models/urn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/urn.py b/mavecore/models/urn.py index 66a0794..ce721c1 100644 --- a/mavecore/models/urn.py +++ b/mavecore/models/urn.py @@ -1,6 +1,7 @@ from pydantic import BaseModel, ValidationError, validator from ..validation_new.constants.urn import * +from ..validation_new.exceptions import ValidationError class Urn(BaseModel): From 28d206be13341a86831b7edb7e95786edb32612b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:58:09 -0700 Subject: [PATCH 472/877] add urn specific validation --- mavecore/models/urn.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mavecore/models/urn.py 
b/mavecore/models/urn.py index ce721c1..05cf2ac 100644 --- a/mavecore/models/urn.py +++ b/mavecore/models/urn.py @@ -7,7 +7,23 @@ class Urn(BaseModel): urn: str + +class ExperimentUrn(Urn): + @validator('urn') + def must_match_regular_expression(cls, v): + if not (MAVEDB_EXPERIMENTSET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) + + +class ExperimentSetUrn(Urn): + @validator('urn') + def must_match_regular_expression(cls, v): + if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) + + +class ScoreSetUrn(Urn): @validator('urn') def must_match_regular_expression(cls, v): - if not MAVEDB_ANY_URN_RE.match(v): - raise ValueError("{}'s is not a valid urn.".format(v)) + if not (MAVEDB_SCORESET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + raise ValidationError("{}'s is not a valid score set urn.".format(v)) \ No newline at end of file From 338b9b469cb6baff1e47f9ff17c7fd1f2277d7cb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 11:58:32 -0700 Subject: [PATCH 473/877] add attributes to urn model --- mavecore/models/user.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index 29e3436..dfe188a 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -1,9 +1,8 @@ from pydantic import BaseModel + class User(BaseModel): - { - "orcid_id": "string", - "firstName": "string", - "lastName": "string", - "email": "string" - }, \ No newline at end of file + orcid_id: str + firstName: str + lastName: str + email: str From c8a3d42c888adff8d0a6d66f9fca51716e1a2251 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 12:08:18 -0700 Subject: [PATCH 474/877] edit imports --- mavecore/models/user.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index dfe188a..927e3db 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -1,4 +1,7 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator +import re + +from ..validation_new.exceptions import ValidationError class User(BaseModel): From b22fd742b08d33adf419a0cbd62fe62edba07182 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 12:08:26 -0700 Subject: [PATCH 475/877] validate email --- mavecore/models/user.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index 927e3db..e2b5858 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -9,3 +9,10 @@ class User(BaseModel): firstName: str lastName: str email: str + + @validator('email') + def check_email_has_valid_structure(cls, v): + # regular expression for validating an Email + regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' + if not (re.fullmatch(regex, v)): + raise ValidationError("{}'s is not a valid email.".format(v)) From 4db40184ae2d9f84fb5c1f6bc5e4bc0e5755905a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 12:11:37 -0700 Subject: [PATCH 476/877] initial commit tests --- tests/test_pydantic_validation/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_pydantic_validation/__init__.py diff --git a/tests/test_pydantic_validation/__init__.py b/tests/test_pydantic_validation/__init__.py new file mode 100644 index 0000000..e69de29 From 21e9ae03f699482334167adac5c5482ac1635cbb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 13:45:27 -0700 Subject: [PATCH 477/877] delete content --- mavecore/__init__.py | 10 -------- .../validation/variant_validators/__init__.py 
| 24 ------------------- 2 files changed, 34 deletions(-) diff --git a/mavecore/__init__.py b/mavecore/__init__.py index 54c1bf6..8b13789 100644 --- a/mavecore/__init__.py +++ b/mavecore/__init__.py @@ -1,11 +1 @@ -from mavecore.validation.variant_validators import ( - validate_hgvs_string, - validate_variant_json, - validate_columns_match, -) -__all__ = [ - "validate_columns_match", - "validate_variant_json", - "validate_hgvs_string", -] diff --git a/mavecore/validation/variant_validators/__init__.py b/mavecore/validation/variant_validators/__init__.py index 1f7aca1..8b13789 100644 --- a/mavecore/validation/variant_validators/__init__.py +++ b/mavecore/validation/variant_validators/__init__.py @@ -1,25 +1 @@ -from .dataset import MaveDataset, MaveCountsDataset, MaveScoresDataset -from .hgvs import ( - validate_nt_variant, - validate_pro_variant, - validate_splice_variant, - validate_hgvs_string, -) - -from .variant import validate_columns_match, validate_variant_json - -__all__ = [ - "dataset", - "variant", - "hgvs", - "validate_nt_variant", - "validate_splice_variant", - "validate_pro_variant", - "validate_hgvs_string", - "validate_columns_match", - "validate_variant_json", - "MaveCountsDataset", - "MaveScoresDataset", - "MaveDataset", -] From 93cae666995bd961276a0b0a8b027a8b51cee560 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 13:45:42 -0700 Subject: [PATCH 478/877] edit import --- mavecore/models/map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index 2d8f942..98e420e 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -2,7 +2,7 @@ from datetime import datetime from typing import Optional -from genome import Genome +from .genome import Genome class ReferenceMap(BaseModel): From dc63f13a9d481397c82a3b9b3990e5adf433d00b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 
Aug 2022 13:45:48 -0700 Subject: [PATCH 479/877] edit import --- mavecore/models/target.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 5ef255e..e8b6ada 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -1,8 +1,8 @@ from pydantic import BaseModel from typing import List -from map import ReferenceMap -from sequence import WildType +from .map import ReferenceMap +from .sequence import WildType class TargetGene(BaseModel): From b66e3fceb5ce74312d7c08e30e7d620721c27018 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 13:46:04 -0700 Subject: [PATCH 480/877] remove import --- mavecore/models/user.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index e2b5858..ca71f4f 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -1,8 +1,6 @@ from pydantic import BaseModel, ValidationError, validator import re -from ..validation_new.exceptions import ValidationError - class User(BaseModel): orcid_id: str From 1f42dcb59eca049957150186cc18569988736031 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 13:46:14 -0700 Subject: [PATCH 481/877] edit error type --- mavecore/models/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index ca71f4f..97b6932 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -13,4 +13,4 @@ def check_email_has_valid_structure(cls, v): # regular expression for validating an Email regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' if not (re.fullmatch(regex, v)): - raise ValidationError("{}'s is not a valid email.".format(v)) + raise ValueError("{}'s is not a valid email.".format(v)) From 8292a0d53110dff147f1c87a231b2128ca59e3ef Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 13:46:45 -0700 Subject: [PATCH 482/877] write unittests for user model validation --- tests/test_pydantic_validation/user.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/test_pydantic_validation/user.py diff --git a/tests/test_pydantic_validation/user.py b/tests/test_pydantic_validation/user.py new file mode 100644 index 0000000..7731aa8 --- /dev/null +++ b/tests/test_pydantic_validation/user.py @@ -0,0 +1,23 @@ +from unittest import TestCase +from mavecore.models.user import User + + +class Test(TestCase): + def test_valid(self): + user = { + "orcid_id": "idididid", + "firstName": "first", + "lastName": "last", + "email": "firstlast@email.edu", + } + User.parse_obj(user) + + def test_invalid_email(self): + user = { + "orcid_id": "idididid", + "firstName": "first", + "lastName": "last", + "email": "firstlastemail.edu", + } + with self.assertRaises(ValueError): + User.parse_obj(user) From 15efbff4e6f4edc07578772791654d03d7a27ffd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:20:38 -0700 Subject: [PATCH 483/877] refactor rename --- .../__init__.py | 0 tests/{test_pydantic_validation => test_model_validation}/user.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_pydantic_validation => test_model_validation}/__init__.py (100%) rename tests/{test_pydantic_validation => test_model_validation}/user.py (100%) diff --git a/tests/test_pydantic_validation/__init__.py b/tests/test_model_validation/__init__.py similarity index 100% rename from tests/test_pydantic_validation/__init__.py rename to tests/test_model_validation/__init__.py diff --git a/tests/test_pydantic_validation/user.py b/tests/test_model_validation/user.py similarity index 100% rename from tests/test_pydantic_validation/user.py rename to tests/test_model_validation/user.py From 4b125886babe5a22444f942a9a069c6ff7c2701d 
Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:20:54 -0700 Subject: [PATCH 484/877] import typing --- mavecore/models/identifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 94af068..0808317 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -1,4 +1,5 @@ from pydantic import BaseModel, HttpUrl +from typing import Optional class Identifier(BaseModel): From b33ec3dc527d1421ced01b5aaf3cfd312e6314b0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:21:09 -0700 Subject: [PATCH 485/877] make attributes for identifiers optional --- mavecore/models/identifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 0808317..bb9f3b3 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -4,8 +4,8 @@ class Identifier(BaseModel): identifier: str - id: 0 - url: HttpUrl + id: Optional[0] + url: Optional[HttpUrl] class DoiIdentifier(Identifier): @@ -13,4 +13,4 @@ class DoiIdentifier(Identifier): class PubmedIdentifier(Identifier): - referenceHtml: str + referenceHtml: Optional[str] From 445f4cec7dbe39f5fdccf9c6eb5f4069327f2379 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:21:23 -0700 Subject: [PATCH 486/877] delete file --- mavecore/models/urn.py | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 mavecore/models/urn.py diff --git a/mavecore/models/urn.py b/mavecore/models/urn.py deleted file mode 100644 index 05cf2ac..0000000 --- a/mavecore/models/urn.py +++ /dev/null @@ -1,29 +0,0 @@ -from pydantic import BaseModel, ValidationError, validator - -from ..validation_new.constants.urn import * -from ..validation_new.exceptions import ValidationError - 
- -class Urn(BaseModel): - urn: str - - -class ExperimentUrn(Urn): - @validator('urn') - def must_match_regular_expression(cls, v): - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) - - -class ExperimentSetUrn(Urn): - @validator('urn') - def must_match_regular_expression(cls, v): - if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) - - -class ScoreSetUrn(Urn): - @validator('urn') - def must_match_regular_expression(cls, v): - if not (MAVEDB_SCORESET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - raise ValidationError("{}'s is not a valid score set urn.".format(v)) \ No newline at end of file From e4ddd9d7203f843bdcd6af3633442f253bb840b3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:26:01 -0700 Subject: [PATCH 487/877] add unittests for target model --- tests/test_model_validation/target.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_model_validation/target.py diff --git a/tests/test_model_validation/target.py b/tests/test_model_validation/target.py new file mode 100644 index 0000000..e69de29 From 34f3c906d246645271bec6ffdf4a6bd3dd7f9950 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:35:24 -0700 Subject: [PATCH 488/877] add unittests for genome model --- tests/test_model_validation/genome.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_model_validation/genome.py diff --git a/tests/test_model_validation/genome.py b/tests/test_model_validation/genome.py new file mode 100644 index 0000000..e69de29 From 369b9e915eb7a33bc75026902f8b1fa93e1e2baa Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 15:37:44 -0700 Subject: 
[PATCH 489/877] add map validation --- tests/test_model_validation/map.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_model_validation/map.py diff --git a/tests/test_model_validation/map.py b/tests/test_model_validation/map.py new file mode 100644 index 0000000..e69de29 From d5f3e002378068465044cf8ed3b2bba3f73b301b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 16:36:27 -0700 Subject: [PATCH 490/877] add sequence validation --- tests/test_model_validation/sequence.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_model_validation/sequence.py diff --git a/tests/test_model_validation/sequence.py b/tests/test_model_validation/sequence.py new file mode 100644 index 0000000..e69de29 From 2020b41d06557bf5428fbb93c98cc617cd043c85 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 16:43:14 -0700 Subject: [PATCH 491/877] add unittesting for identifier validation --- tests/test_model_validation/identifier.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_model_validation/identifier.py diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py new file mode 100644 index 0000000..e69de29 From ef4a0e3c03ba23a08f9f3b0258aa711921706e34 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 11 Aug 2022 16:43:30 -0700 Subject: [PATCH 492/877] add unittesting for data validation --- tests/test_model_validation/data.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/test_model_validation/data.py diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py new file mode 100644 index 0000000..751f24c --- /dev/null +++ b/tests/test_model_validation/data.py @@ -0,0 +1,21 @@ +from unittest import TestCase +from pydantic import 
BaseModel +from mavecore.models.data import DataSet # data#, genome, identifier, map, sequence, target, urn, user + + +class Test(TestCase): + def test_no_change(self): + """ + class DataSet(BaseModel): + title: str + shortDescription: str + abstractText: str + methodText: str""" + + dictionary ={ + "title": "string", + "shortDescription": "string", + "abstractText": "string", + "methodText": "string", + } + DataSet.parse_obj(dictionary) From 9a2f90788917d971c3979a12694fc42df9a23fc3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:43:29 -0700 Subject: [PATCH 493/877] edit imports --- mavecore/models/data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 2a1599e..ad98009 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -1,11 +1,12 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator from datetime import datetime from typing import List, Dict, Optional -from user import User -from identifier import DoiIdentifier, PubmedIdentifier -from target import TargetGene -from urn import ExperimentUrn, ExperimentSetUrn, ScoreSetUrn +from .user import User +from .identifier import DoiIdentifier, PubmedIdentifier +from .target import TargetGene + +from mavecore.validation_new.constants.urn import * class DataSet(BaseModel): From d33ac639dcc171771728f25c9e28a8a7091221d3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:43:45 -0700 Subject: [PATCH 494/877] mark attributes as optional --- mavecore/models/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index ad98009..3350708 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -15,9 +15,9 @@ class DataSet(BaseModel): abstractText: str methodText: str extraMetadata: 
Optional[Dict] - creationDate: datetime - publishedDate: Optional[datetime] - modificationDate: Optional[datetime] + creationDate: Optional[str] + publishedDate: Optional[str] + modificationDate: Optional[str] createdBy: Optional[User] modifiedBy: Optional[User] From c27ef83ca0eb69e235083f2e7684e0503fa21f74 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:43:53 -0700 Subject: [PATCH 495/877] validate date --- mavecore/models/data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 3350708..27f69d9 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -21,6 +21,13 @@ class DataSet(BaseModel): createdBy: Optional[User] modifiedBy: Optional[User] + @validator('creationDate', 'publishedDate', 'modificationDate') + def date_must_match_regex(cls, v): + # regular expression for validating a date + regex = '%Y-%m-%d' + if not bool(datetime.strptime(v, regex)): + raise ValidationError("{}'s is not a valid date.".format(v)) + class Experiment(DataSet): urn: ExperimentUrn From 86905414535a853010f6218291d72651119971e8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:44:15 -0700 Subject: [PATCH 496/877] mark attributes as optional, validate urn --- mavecore/models/data.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 27f69d9..9b8d112 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -30,13 +30,26 @@ def date_must_match_regex(cls, v): class Experiment(DataSet): - urn: ExperimentUrn - keywords: List[str] - numScoresets: int - experimentSetUrn: ExperimentSetUrn - doiIdentifiers: Optional[DoiIdentifier] - pubmedIdentifiers: Optional[PubmedIdentifier] - processingState: str + urn: Optional[str] + keywords: Optional[List[str]] + numScoresets: Optional[int] + 
experimentSetUrn: Optional[str] + doiIdentifiers: Optional[List[DoiIdentifier]] + pubmedIdentifiers: Optional[List[PubmedIdentifier]] + processingState: Optional[str] + + @validator('urn') + def urn_must_match_regex(cls, v): + regex = MAVEDB_TMP_URN_RE + if not (re.fullmatch(regex, v)): + raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) + #if not (MAVEDB_EXPERIMENTSET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + # raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) + + @validator('experimentSetUrn') + def experiment_set_urn_must_match_regex(cls, v): + if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) class ExperimentSet(DataSet): From ae88f1c73a19d6c9b7bed615ad1568896eb3782d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:44:26 -0700 Subject: [PATCH 497/877] mark attributes as optional --- mavecore/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 9b8d112..094f460 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -53,7 +53,7 @@ def experiment_set_urn_must_match_regex(cls, v): class ExperimentSet(DataSet): - urn: ExperimentSetUrn + urn: Optional[str] id: int experiments: List[Experiment] numExperiments: int From 31ce74e6f05e3c8966b7cbb1093fdb97aa3df3b2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:44:33 -0700 Subject: [PATCH 498/877] validate urn --- mavecore/models/data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 094f460..7dcffda 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -58,6 +58,11 @@ class ExperimentSet(DataSet): experiments: List[Experiment] numExperiments: int + @validator('urn') + def 
must_match_regular_expression(cls, v): + if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) + class ScoreSet(DataSet): urn: ScoreSetUrn From 6720408cb97b8abc02f5c326a1cf7fabe18a16af Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:44:46 -0700 Subject: [PATCH 499/877] mark attributes as optional --- mavecore/models/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 7dcffda..dac9e40 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -65,15 +65,15 @@ def must_match_regular_expression(cls, v): class ScoreSet(DataSet): - urn: ScoreSetUrn + urn: Optional[str] dataUsagePolicy: str licenceId: int replacesId: Optional[int] keywords: Optional[List[str]] numVariants: int experiment: Experiment - doiIdentifiers: Optional[DoiIdentifier] - pubmedIdentifiers: Optional[PubmedIdentifier] + doiIdentifiers: Optional[List[DoiIdentifier]] + pubmedIdentifiers: Optional[List[PubmedIdentifier]] targetGene: TargetGene datasetColumns: Dict private: bool From b65690effdb6e8147435f67eec9440bf97dd9647 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:44:53 -0700 Subject: [PATCH 500/877] validate urn --- mavecore/models/data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index dac9e40..5c4cd5f 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -77,3 +77,8 @@ class ScoreSet(DataSet): targetGene: TargetGene datasetColumns: Dict private: bool + + @validator('urn') + def must_match_regular_expression(cls, v): + if not (MAVEDB_SCORESET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): + raise ValidationError("{}'s is not a valid score set urn.".format(v)) From 712db8450a27a5f71ced6401e91462eb28e22db8 Mon Sep 
17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:45:16 -0700 Subject: [PATCH 501/877] edit imports --- tests/test_model_validation/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index 751f24c..2485922 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -1,6 +1,7 @@ from unittest import TestCase -from pydantic import BaseModel -from mavecore.models.data import DataSet # data#, genome, identifier, map, sequence, target, urn, user +from pydantic import ValidationError +from mavecore.models.data import DataSet, Experiment, ExperimentSet, ScoreSet + class Test(TestCase): From 480c6303148391d536a45e93b58456461d1decb8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:45:33 -0700 Subject: [PATCH 502/877] test DataSet model --- tests/test_model_validation/data.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index 2485922..b9332aa 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -3,6 +3,22 @@ from mavecore.models.data import DataSet, Experiment, ExperimentSet, ScoreSet +class TestDataSet(TestCase): + def test_valid_all_fields(self): + user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} + dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "creationDate": "2022-02-02", + "publishedDate": "2022-02-02", + "modificationDate": "2022-02-02", + "createdBy": user, + "modifiedBy": user, + } + DataSet.parse_obj(dataset) class Test(TestCase): def test_no_change(self): From c28a0b6fec3767579b0f7e130acfe8d87e9b10b5 Mon Sep 17 00:00:00 2001 
From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:45:50 -0700 Subject: [PATCH 503/877] test DataSet model excluding optional attributes --- tests/test_model_validation/data.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index b9332aa..176d1c5 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -20,14 +20,14 @@ def test_valid_all_fields(self): } DataSet.parse_obj(dataset) -class Test(TestCase): - def test_no_change(self): - """ - class DataSet(BaseModel): - title: str - shortDescription: str - abstractText: str - methodText: str""" + def test_valid_exclude_optional(self): + dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + } + DataSet.parse_obj(dataset) dictionary ={ "title": "string", From 2613ded458d57b26f695e0f49f43ae14ad4b8787 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:46:52 -0700 Subject: [PATCH 504/877] test DataSet model invalid dates --- tests/test_model_validation/data.py | 64 ++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index 176d1c5..e8c8e3f 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -29,10 +29,64 @@ def test_valid_exclude_optional(self): } DataSet.parse_obj(dataset) - dictionary ={ - "title": "string", - "shortDescription": "string", - "abstractText": "string", - "methodText": "string", + def test_invalid_creation_date(self): + dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "creationDate": "2022-02-02-", + } + with self.assertRaises(ValidationError): + 
DataSet.parse_obj(dataset) + + def test_invalid_published_date(self): + dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "publishedDate": "2022-02-02-", + } + with self.assertRaises(ValidationError): + DataSet.parse_obj(dataset) + + def test_invalid_modification_date(self): + dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "creationDate": "2022-02-02", + "publishedDate": "2022-02-02", + "modificationDate": "a", + } + with self.assertRaises(ValidationError): + DataSet.parse_obj(dataset) + + +class TestExperiment(TestCase): + def test_valid_all_fields(self): + user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} + doi_identifier = {"identifier": "id"} + pubmed_identifier = {"identifier": "id"} + experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "creationDate": "2022-02-02", + "publishedDate": "2022-02-02", + "modificationDate": "2022-02-02", + "createdBy": user, + "modifiedBy": user, + "urn": "tmp:070b3886-ed72-4ce9-a574-6754ad00310b", + "keywords": ["string"], + "numScoresets": 0, + #"experimentSetUrn": "urn", + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + "processingState": "string", } DataSet.parse_obj(dictionary) From 6ebb1bebce1d40c0aaaea156a70dbf25656c2bdd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:47:16 -0700 Subject: [PATCH 505/877] test Experiment model exclude optional attributes --- tests/test_model_validation/data.py | 71 ++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index e8c8e3f..b8f484a 100644 --- 
a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -89,4 +89,73 @@ def test_valid_all_fields(self): "pubmedIdentifiers": [pubmed_identifier], "processingState": "string", } - DataSet.parse_obj(dictionary) + Experiment.parse_obj(experiment) + + def test_valid_exclude_optional(self): + experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + } + Experiment.parse_obj(experiment) + + +class TestExperimentSet(TestCase): + def test_valid(self): + user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} + experiment = {"title": "title", "shortDescription": "short description", "abstractText": "abstract", "methodText": "methods"} + experimentset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "creationDate": "2022-02-02", + "publishedDate": "2022-02-02", + "modificationDate": "2022-02-02", + "createdBy": user, + "modifiedBy": user, + #"urn": "urn", + "id": 0, + "experiments": [experiment], + "numExperiments": 1, + } + ExperimentSet.parse_obj(experimentset) + + +class TestScoreSet(TestCase): + def test_valid_all_fields(self): + user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} + experiment = {"title": "title", "shortDescription": "short description", "abstractText": "abstract", "methodText": "methods"} + doi_identifier = {"identifier": "id"} + pubmed_identifier = {"identifier": "id"} + genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} + sequence = {"sequenceType": "DNA", "sequence": "ATCG"} + target = {"name": "name", "category": "Protein coding", "referenceMaps": [reference_map], "wtSequence": sequence,} + scoreset = { + "title": "title", + 
"shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "creationDate": "2022-02-02", + "publishedDate": "2022-02-02", + "modificationDate": "2022-02-02", + "createdBy": user, + "modifiedBy": user, + #"urn": "urn", + "dataUsagePolicy": "policy", + "licenceId": 0, + "replacesId": 0, + "keywords": ["string"], + "numVariants": 0, + "experiment": experiment, + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + "targetGene": target, + "datasetColumns": {}, + "private": True, + } + ScoreSet.parse_obj(scoreset) From eefa4df878c4fc4b4d17510742912b18bc961103 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:47:27 -0700 Subject: [PATCH 506/877] edit imports --- mavecore/models/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py index 3130833..6edff14 100644 --- a/mavecore/models/genome.py +++ b/mavecore/models/genome.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator from datetime import datetime from typing import Optional From 448e76116ba6bbcb0c5d25e60455989cd3146dc0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:47:38 -0700 Subject: [PATCH 507/877] mark attribute as optional --- mavecore/models/genome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py index 6edff14..f2a1b65 100644 --- a/mavecore/models/genome.py +++ b/mavecore/models/genome.py @@ -7,6 +7,6 @@ class Genome(BaseModel): shortName: str organismName: str genomeId: int - creationDate: datetime + creationDate: Optional[datetime] modificationDate: Optional[datetime] id: int From 84046847dcac34180d2f68ce8301fcf817c6fcb5 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:47:52 -0700 Subject: [PATCH 508/877] validate date --- mavecore/models/genome.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py index f2a1b65..4aaf338 100644 --- a/mavecore/models/genome.py +++ b/mavecore/models/genome.py @@ -10,3 +10,10 @@ class Genome(BaseModel): creationDate: Optional[datetime] modificationDate: Optional[datetime] id: int + + @validator('creationDate', 'modificationDate') + def date_must_match_regex(cls, v): + # regular expression for validating a date + regex = '%Y-%m-%d' + if not bool(datetime.strptime(v, regex)): + raise ValidationError("{}'s is not a valid date.".format(v)) From 6594fc4936fe575a44a3d1f48f5cfd04a3c40d02 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:48:06 -0700 Subject: [PATCH 509/877] write test cases for Genome model --- tests/test_model_validation/genome.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_model_validation/genome.py b/tests/test_model_validation/genome.py index e69de29..9c2be1d 100644 --- a/tests/test_model_validation/genome.py +++ b/tests/test_model_validation/genome.py @@ -0,0 +1,15 @@ +from unittest import TestCase +from mavecore.models.user import User +from mavecore.models.genome import Genome + + +class TestGenome(TestCase): + def test_valid(self): + genome = { + "shortName": "name", + "organismName": "organism", + "genomeId": 0, + "id": 0, + } + Genome.parse_obj(genome) + From 152cf43bf7ca09e9f960dc35c8f8c1d186070ca2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:48:17 -0700 Subject: [PATCH 510/877] mark attribute as optional --- mavecore/models/identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index bb9f3b3..3972020 100644 --- 
a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -4,7 +4,7 @@ class Identifier(BaseModel): identifier: str - id: Optional[0] + id: Optional[int] url: Optional[HttpUrl] From 298bc89d0dcf70645242f3d945e1a30453f2d6a5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:48:32 -0700 Subject: [PATCH 511/877] write unittests for Identifier model --- tests/test_model_validation/identifier.py | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py index e69de29..485fbdb 100644 --- a/tests/test_model_validation/identifier.py +++ b/tests/test_model_validation/identifier.py @@ -0,0 +1,54 @@ +from unittest import TestCase +from mavecore.models.identifier import Identifier, DoiIdentifier, PubmedIdentifier + + +class TestIdentifier(TestCase): + def test_valid_all_fields(self): + identifier = { + "identifier": "id", + "id": 0, + "url": "https://www.uw.edu", + } + Identifier.parse_obj(identifier) + + def test_valid_exclude_optional(self): + identifier = { + "identifier": "id", + } + Identifier.parse_obj(identifier) + + def test_invalid_url(self): + identifier = { + "identifier": "id", + "id": 0, + "url": "www.uw.edu", + } + with self.assertRaises(ValueError): + Identifier.parse_obj(identifier) + + +class TestDoiIdentifier(TestCase): + def test_valid_all_fields(self): + doi_identifier = { + "identifier": "id", + "id": 0, + "url": "https://www.uw.edu", + } + DoiIdentifier.parse_obj(doi_identifier) + + +class TestPubmedIdentifier(TestCase): + def test_valid_all_fields(self): + pubmed_identifier = { + "identifier": "id", + "id": 0, + "url": "https://www.uw.edu", + "referenceHtml": "referencehtml", + } + PubmedIdentifier.parse_obj(pubmed_identifier) + + def test_valid_exclude_optional(self): + pubmed_identifier = { + "identifier": "id", + } + PubmedIdentifier.parse_obj(pubmed_identifier) From 
c4b6cf33a69b1bbbcccc5d501c23b563b96cfdbd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:48:41 -0700 Subject: [PATCH 512/877] edit imports --- mavecore/models/map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index 98e420e..c92850e 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator from datetime import datetime from typing import Optional From 690890856f0ddaeaa5c2f093a84207107128092c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:49:03 -0700 Subject: [PATCH 513/877] mark attributes as optional, validate dates --- mavecore/models/map.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index c92850e..b3d3c7e 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -11,5 +11,12 @@ class ReferenceMap(BaseModel): targetId: int isPrimary: bool genome: Genome - creationDate: datetime - modificationDate: Optional[datetime] \ No newline at end of file + creationDate: Optional[str] + modificationDate: Optional[str] + + @validator('creationDate', 'modificationDate') + def date_must_match_regex(cls, v): + # regular expression for validating a date + regex = '%Y-%m-%d' + if not bool(datetime.strptime(v, regex)): + raise ValidationError("{}'s is not a valid date.".format(v)) From ac0e1265fc015b7f7f546c487d49ed10d8b2cda4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:49:18 -0700 Subject: [PATCH 514/877] write unittests for Map model --- tests/test_model_validation/map.py | 42 ++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_model_validation/map.py b/tests/test_model_validation/map.py 
index e69de29..6062b9d 100644 --- a/tests/test_model_validation/map.py +++ b/tests/test_model_validation/map.py @@ -0,0 +1,42 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.map import ReferenceMap + + +class TestReferenceMap(TestCase): + def test_valid_all_fields(self): + genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = { + "id": 0, + "genomeId": 0, + "targetId": 0, + "isPrimary": True, + "genome": genome, + "creationDate": "2022-02-02", + "modificationDate": "2022-02-02", + } + ReferenceMap.parse_obj(reference_map) + + def test_valid_exclude_optional(self): + genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = { + "id": 0, + "genomeId": 0, + "targetId": 0, + "isPrimary": True, + "genome": genome, + } + ReferenceMap.parse_obj(reference_map) + + def test_invalid_creation_date(self): + genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = { + "id": 0, + "genomeId": 0, + "targetId": 0, + "isPrimary": True, + "genome": genome, + "creationDate": "2022-02-02-", + } + with self.assertRaises(ValidationError): + ReferenceMap.parse_obj(reference_map) From 43a80c0500b84d25f36e7509b88cd3257cb0fa09 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:49:25 -0700 Subject: [PATCH 515/877] edit imports --- mavecore/models/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index 87c31e8..a193461 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator class WildType(BaseModel): From b9a63acae6bbfa60decce6c95dcf68cc045eb1c8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 
18:49:44 -0700 Subject: [PATCH 516/877] validate sequence type category --- mavecore/models/sequence.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index a193461..cf43f9f 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -4,3 +4,10 @@ class WildType(BaseModel): sequenceType: str sequence: str + + @validator('sequenceType') + def validate_category(cls, v): + valid_sequence_types = ["Infer", "DNA", "Protein"] + if v not in valid_sequence_types: + raise ValidationError("{}'s is not a valid sequence type. Valid sequence types are " + "Infer, DNA, and Protein".format(v)) From 8d7f198ca56e00d93725e4ab2ff8560d68960197 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:49:58 -0700 Subject: [PATCH 517/877] write unittests for Sequence model --- tests/test_model_validation/sequence.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_model_validation/sequence.py b/tests/test_model_validation/sequence.py index e69de29..91fc9d2 100644 --- a/tests/test_model_validation/sequence.py +++ b/tests/test_model_validation/sequence.py @@ -0,0 +1,20 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.sequence import WildType + + +class Test(TestCase): + def test_valid_all_fields(self): + sequence = { + "sequenceType": "Protein", + "sequence": "ATCG", + } + WildType.parse_obj(sequence) + + def test_invalid_sequence_type(self): + sequence = { + "sequenceType": "RNA", + "sequence": "ATCG", + } + with self.assertRaises(ValidationError): + WildType.parse_obj(sequence) From 69f3a2b0fb46ea2e81fa9e57546dfacac7579f04 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:50:05 -0700 Subject: [PATCH 518/877] edit imports --- mavecore/models/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mavecore/models/target.py b/mavecore/models/target.py index e8b6ada..bc48771 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError, validator from typing import List from .map import ReferenceMap From f004a4836e730b007cfda29e37a019b7a9a5fdc3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:50:17 -0700 Subject: [PATCH 519/877] validate target gene category --- mavecore/models/target.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index bc48771..71e2d61 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -10,3 +10,10 @@ class TargetGene(BaseModel): category: str referenceMaps: List[ReferenceMap] wtSequence: WildType + + @validator('category') + def validate_category(cls, v): + valid_categories = ["Protein coding", "Regulatory", "Other noncoding"] + if v not in valid_categories: + raise ValidationError("{}'s is not a valid target category. 
Valid categories are " + "Protein coding, Regulatory, and Other noncoding".format(v)) From 479e69514611b1998d86aea75dffa5d459ed21e9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:50:28 -0700 Subject: [PATCH 520/877] write unittests for Target model --- tests/test_model_validation/target.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_model_validation/target.py b/tests/test_model_validation/target.py index e69de29..163f787 100644 --- a/tests/test_model_validation/target.py +++ b/tests/test_model_validation/target.py @@ -0,0 +1,41 @@ +from unittest import TestCase +from pydantic import ValidationError +from mavecore.models.target import TargetGene + + +class TestTargetGene(TestCase): + def test_valid_all_fields(self): + genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} + sequence = {"sequenceType": "type", "sequence": "ATCG"} + target = { + "name": "name", + "category": "Protein coding", + "referenceMaps": [reference_map], + "wtSequence": sequence, + } + TargetGene.parse_obj(target) + + def test_invalid_category(self): + genome = {"shortName": "names", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} + sequence = {"sequenceType": "type", "sequence": "ATCG"} + target = { + "name": "name", + "category": "Protein", + "referenceMaps": [reference_map], + "wtSequence": sequence, + } + with self.assertRaises(ValidationError): + TargetGene.parse_obj(target) + + def test_invalid_missing_required_field(self): + genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} + reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} + target = { + "name": "name", + "category": "Protein coding", + 
"referenceMaps": [reference_map], + } + with self.assertRaises(ValidationError): + TargetGene.parse_obj(target) From bd157c5498c90bff361d8598f723369679682338 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:50:37 -0700 Subject: [PATCH 521/877] change error type --- mavecore/models/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/user.py b/mavecore/models/user.py index 97b6932..ca71f4f 100644 --- a/mavecore/models/user.py +++ b/mavecore/models/user.py @@ -13,4 +13,4 @@ def check_email_has_valid_structure(cls, v): # regular expression for validating an Email regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' if not (re.fullmatch(regex, v)): - raise ValueError("{}'s is not a valid email.".format(v)) + raise ValidationError("{}'s is not a valid email.".format(v)) From 158be3fb7efada003de9cb5c9039cd44b65013ea Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:50:46 -0700 Subject: [PATCH 522/877] edit imports --- tests/test_model_validation/user.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_model_validation/user.py b/tests/test_model_validation/user.py index 7731aa8..61aed76 100644 --- a/tests/test_model_validation/user.py +++ b/tests/test_model_validation/user.py @@ -1,4 +1,5 @@ from unittest import TestCase +from pydantic import ValidationError from mavecore.models.user import User From a689781d9c9590eb46d221df4f98c7e99ffe6f38 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:51:35 -0700 Subject: [PATCH 523/877] instantiate unittest model for User --- tests/test_model_validation/user.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_model_validation/user.py b/tests/test_model_validation/user.py index 61aed76..6daea4a 100644 --- a/tests/test_model_validation/user.py +++ 
b/tests/test_model_validation/user.py @@ -3,8 +3,8 @@ from mavecore.models.user import User -class Test(TestCase): - def test_valid(self): +class TestUser(TestCase): + def test_valid_all_fields(self): user = { "orcid_id": "idididid", "firstName": "first", From 5988c5b24ccf8a3912343b6f041baa1995dc16d2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:51:47 -0700 Subject: [PATCH 524/877] write user fields for test case --- tests/test_model_validation/user.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model_validation/user.py b/tests/test_model_validation/user.py index 6daea4a..5e9f8da 100644 --- a/tests/test_model_validation/user.py +++ b/tests/test_model_validation/user.py @@ -6,10 +6,10 @@ class TestUser(TestCase): def test_valid_all_fields(self): user = { - "orcid_id": "idididid", - "firstName": "first", - "lastName": "last", - "email": "firstlast@email.edu", + "orcid_id": "idididid", + "firstName": "first", + "lastName": "last", + "email": "firstlast@email.edu", } User.parse_obj(user) From 003db4030d9006ec21b2744813d41aeacc006f49 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 12 Aug 2022 18:51:58 -0700 Subject: [PATCH 525/877] edit error type --- tests/test_model_validation/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model_validation/user.py b/tests/test_model_validation/user.py index 5e9f8da..61d5943 100644 --- a/tests/test_model_validation/user.py +++ b/tests/test_model_validation/user.py @@ -20,5 +20,5 @@ def test_invalid_email(self): "lastName": "last", "email": "firstlastemail.edu", } - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): User.parse_obj(user) From 5ee3021f67dd957c42278d033c50887ec4b386d8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 16:14:31 -0700 Subject: [PATCH 526/877] 
edit imports --- mavecore/models/identifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 3972020..daefa0b 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -1,5 +1,6 @@ -from pydantic import BaseModel, HttpUrl +from pydantic import BaseModel, ValidationError, validator, HttpUrl from typing import Optional +import idutils class Identifier(BaseModel): From f4f55e3e1dc4ca1dc3baa335d89ff28acde851d5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 16:14:40 -0700 Subject: [PATCH 527/877] validate DOI --- mavecore/models/identifier.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index daefa0b..cf7305f 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -10,7 +10,11 @@ class Identifier(BaseModel): class DoiIdentifier(Identifier): - pass + + @validator('identifier') + def must_match_regular_expression(cls, v): + if not idutils.is_doi(v): + raise ValidationError("{} is not a valid DOI identifier.".format(v)) class PubmedIdentifier(Identifier): From 92e75752e892be42e0052f05fdb4fbb1444f590e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 16:14:49 -0700 Subject: [PATCH 528/877] validate PubMed --- mavecore/models/identifier.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index cf7305f..8a39297 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -19,3 +19,8 @@ def must_match_regular_expression(cls, v): class PubmedIdentifier(Identifier): referenceHtml: Optional[str] + + @validator('identifier') + def must_match_regular_expression(cls, v): + if not idutils.is_pmid(v): + raise ValidationError("{} is not a valid PubMed 
identifier.".format(v)) From abbfb70f2ff9491874fc2dca9736cfbb9ba0ca24 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:22:25 -0700 Subject: [PATCH 529/877] rename methods --- mavecore/models/identifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 8a39297..3eae8f0 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -12,7 +12,7 @@ class Identifier(BaseModel): class DoiIdentifier(Identifier): @validator('identifier') - def must_match_regular_expression(cls, v): + def must_be_valid_doi(cls, v): if not idutils.is_doi(v): raise ValidationError("{} is not a valid DOI identifier.".format(v)) @@ -21,6 +21,6 @@ class PubmedIdentifier(Identifier): referenceHtml: Optional[str] @validator('identifier') - def must_match_regular_expression(cls, v): + def must_be_valid_pubmed(cls, v): if not idutils.is_pmid(v): raise ValidationError("{} is not a valid PubMed identifier.".format(v)) From 3dbfb9645b2063d38ce25ae3defa177634371f55 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:22:49 -0700 Subject: [PATCH 530/877] make identifier attributes valid --- tests/test_model_validation/identifier.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py index 485fbdb..df5cdfa 100644 --- a/tests/test_model_validation/identifier.py +++ b/tests/test_model_validation/identifier.py @@ -5,7 +5,7 @@ class TestIdentifier(TestCase): def test_valid_all_fields(self): identifier = { - "identifier": "id", + "identifier": "10.1038/s41588-018-0122-z", "id": 0, "url": "https://www.uw.edu", } @@ -13,13 +13,13 @@ def test_valid_all_fields(self): def test_valid_exclude_optional(self): identifier = { - "identifier": "id", + "identifier": "29785012", } 
Identifier.parse_obj(identifier) def test_invalid_url(self): identifier = { - "identifier": "id", + "identifier": "29785012", "id": 0, "url": "www.uw.edu", } @@ -30,7 +30,7 @@ def test_invalid_url(self): class TestDoiIdentifier(TestCase): def test_valid_all_fields(self): doi_identifier = { - "identifier": "id", + "identifier": "10.1038/s41588-018-0122-z", "id": 0, "url": "https://www.uw.edu", } @@ -40,7 +40,7 @@ def test_valid_all_fields(self): class TestPubmedIdentifier(TestCase): def test_valid_all_fields(self): pubmed_identifier = { - "identifier": "id", + "identifier": "29785012", "id": 0, "url": "https://www.uw.edu", "referenceHtml": "referencehtml", From 323bc9b90147e13d0a4ac34900dbb7667f750bde Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:23:14 -0700 Subject: [PATCH 531/877] test for invalid identifier types --- tests/test_model_validation/identifier.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py index df5cdfa..5ff343e 100644 --- a/tests/test_model_validation/identifier.py +++ b/tests/test_model_validation/identifier.py @@ -36,6 +36,13 @@ def test_valid_all_fields(self): } DoiIdentifier.parse_obj(doi_identifier) + def test_invalid_type_of_identifier(self): + identifier = { + "identifier": "29785012", + } + with self.assertRaises(ValueError): + DoiIdentifier.parse_obj(identifier) + class TestPubmedIdentifier(TestCase): def test_valid_all_fields(self): @@ -52,3 +59,10 @@ def test_valid_exclude_optional(self): "identifier": "id", } PubmedIdentifier.parse_obj(pubmed_identifier) + + def test_invalid_type_of_identifier(self): + identifier = { + "identifier": "10.1038/s41588-018-0122-z", + } + with self.assertRaises(ValueError): + PubmedIdentifier.parse_obj(identifier) From 598d5cd2244813ebcff75a221dffa78d9b92ea41 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> 
Date: Fri, 19 Aug 2022 17:23:29 -0700 Subject: [PATCH 532/877] make identifier attribute valid --- tests/test_model_validation/identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py index 5ff343e..686aa7e 100644 --- a/tests/test_model_validation/identifier.py +++ b/tests/test_model_validation/identifier.py @@ -56,7 +56,7 @@ def test_valid_all_fields(self): def test_valid_exclude_optional(self): pubmed_identifier = { - "identifier": "id", + "identifier": "29785012", } PubmedIdentifier.parse_obj(pubmed_identifier) From c346d2d19943ea0abab7472a6015d03fec64e6a1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:45:11 -0700 Subject: [PATCH 533/877] import constants --- mavecore/models/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 5c4cd5f..c291d61 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -7,6 +7,7 @@ from .target import TargetGene from mavecore.validation_new.constants.urn import * +from mavecore.validation.utilities import is_null class DataSet(BaseModel): From 694e43617a78ef6a3521bcd3043624d044bd3772 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:45:29 -0700 Subject: [PATCH 534/877] rename methods --- mavecore/models/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index c291d61..3e82107 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -40,7 +40,7 @@ class Experiment(DataSet): processingState: Optional[str] @validator('urn') - def urn_must_match_regex(cls, v): + def validate_urn_matches_regex(cls, v): regex = MAVEDB_TMP_URN_RE if not (re.fullmatch(regex, v)): raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) @@ -48,7 
+48,7 @@ def urn_must_match_regex(cls, v): # raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) @validator('experimentSetUrn') - def experiment_set_urn_must_match_regex(cls, v): + def validate_experiment_set_urn_matches_regex(cls, v): if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) @@ -60,7 +60,7 @@ class ExperimentSet(DataSet): numExperiments: int @validator('urn') - def must_match_regular_expression(cls, v): + def validate_matches_regular_expression(cls, v): if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) @@ -80,6 +80,6 @@ class ScoreSet(DataSet): private: bool @validator('urn') - def must_match_regular_expression(cls, v): + def validate_matches_regular_expression(cls, v): if not (MAVEDB_SCORESET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): raise ValidationError("{}'s is not a valid score set urn.".format(v)) From 119d921e391c0260550b4eea6320965cfc27fee0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:45:48 -0700 Subject: [PATCH 535/877] write method to validate keywords --- mavecore/models/data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 3e82107..39f086b 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -47,6 +47,15 @@ def validate_urn_matches_regex(cls, v): #if not (MAVEDB_EXPERIMENTSET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): # raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) + @validator('keywords') + def validate_keywords(cls, v): + if is_null(v): + raise ValidationError("{} are not valid keywords. 
Keywords must be a valid list of strings.".format(v)) + else: + for keyword in v: + if is_null(keyword) or not isinstance(keyword, str): + raise ValidationError("{} not a valid keyword. Keywords must be valid strings.".format(keyword)) + @validator('experimentSetUrn') def validate_experiment_set_urn_matches_regex(cls, v): if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): From bd0dc39dbeb565e82bb063ac4c3dc529e75452e9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:46:12 -0700 Subject: [PATCH 536/877] update identifier attributes to valid values --- tests/test_model_validation/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index b8f484a..445136f 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -68,8 +68,8 @@ def test_invalid_modification_date(self): class TestExperiment(TestCase): def test_valid_all_fields(self): user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} - doi_identifier = {"identifier": "id"} - pubmed_identifier = {"identifier": "id"} + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} experiment = { "title": "title", "shortDescription": "short description", @@ -128,8 +128,8 @@ class TestScoreSet(TestCase): def test_valid_all_fields(self): user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} experiment = {"title": "title", "shortDescription": "short description", "abstractText": "abstract", "methodText": "methods"} - doi_identifier = {"identifier": "id"} - pubmed_identifier = {"identifier": "id"} + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} genome = {"shortName": "name", "organismName": "organism", 
"genomeId": 0, "id": 0} reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} sequence = {"sequenceType": "DNA", "sequence": "ATCG"} From 9633df473120fcb624fc96aebb0f033ac63aa697 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:46:26 -0700 Subject: [PATCH 537/877] comment out urn attribute --- tests/test_model_validation/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index 445136f..ff6f416 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -81,7 +81,7 @@ def test_valid_all_fields(self): "modificationDate": "2022-02-02", "createdBy": user, "modifiedBy": user, - "urn": "tmp:070b3886-ed72-4ce9-a574-6754ad00310b", + #"urn": "tmp:070b3886-ed72-4ce9-a574-6754ad00310b", "keywords": ["string"], "numScoresets": 0, #"experimentSetUrn": "urn", From 5e570861c8e28e9b5c1210605f2f604a7e77d3eb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:50:00 -0700 Subject: [PATCH 538/877] test invalid keywords --- tests/test_model_validation/data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index ff6f416..2ac41b5 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -100,6 +100,17 @@ def test_valid_exclude_optional(self): } Experiment.parse_obj(experiment) + def test_invalid_keywords(self): + experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "keywords": ["null"], + } + with self.assertRaises(ValidationError): + Experiment.parse_obj(experiment) + class TestExperimentSet(TestCase): def test_valid(self): From ae4a9ff8f8b5159a9e90f69babc7d2862e1d7ac8 Mon Sep 17 00:00:00 2001 From: 
harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:50:13 -0700 Subject: [PATCH 539/877] import ValidationError from pydantic --- tests/test_model_validation/identifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py index 686aa7e..37207ef 100644 --- a/tests/test_model_validation/identifier.py +++ b/tests/test_model_validation/identifier.py @@ -1,4 +1,5 @@ from unittest import TestCase +from pydantic import ValidationError from mavecore.models.identifier import Identifier, DoiIdentifier, PubmedIdentifier From 922763b625691bfae9af14301f6c9e79e90671ff Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 19 Aug 2022 17:50:26 -0700 Subject: [PATCH 540/877] change error type --- tests/test_model_validation/identifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_model_validation/identifier.py b/tests/test_model_validation/identifier.py index 37207ef..93dc885 100644 --- a/tests/test_model_validation/identifier.py +++ b/tests/test_model_validation/identifier.py @@ -24,7 +24,7 @@ def test_invalid_url(self): "id": 0, "url": "www.uw.edu", } - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): Identifier.parse_obj(identifier) @@ -41,7 +41,7 @@ def test_invalid_type_of_identifier(self): identifier = { "identifier": "29785012", } - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): DoiIdentifier.parse_obj(identifier) @@ -65,5 +65,5 @@ def test_invalid_type_of_identifier(self): identifier = { "identifier": "10.1038/s41588-018-0122-z", } - with self.assertRaises(ValueError): + with self.assertRaises(ValidationError): PubmedIdentifier.parse_obj(identifier) From 2a28f856bca6c5017df934f1dc8ba85bdbfe50a6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:28:43 -0700 Subject: 
[PATCH 541/877] delete unneeded files --- mavecore/validation_new/experiment.py | 81 ----------------- mavecore/validation_new/general.py | 73 --------------- mavecore/validation_new/identifiers.py | 46 ---------- mavecore/validation_new/metadata.py | 15 ---- mavecore/validation_new/scoreset.py | 89 ------------------- mavecore/validation_new/summary.py | 117 ------------------------- mavecore/validation_new/target.py | 78 ----------------- mavecore/validation_new/type.py | 25 ------ mavecore/validation_new/urn.py | 97 -------------------- mavecore/validation_new/user.py | 97 -------------------- 10 files changed, 718 deletions(-) delete mode 100644 mavecore/validation_new/experiment.py delete mode 100644 mavecore/validation_new/general.py delete mode 100644 mavecore/validation_new/identifiers.py delete mode 100644 mavecore/validation_new/metadata.py delete mode 100644 mavecore/validation_new/scoreset.py delete mode 100644 mavecore/validation_new/summary.py delete mode 100644 mavecore/validation_new/target.py delete mode 100644 mavecore/validation_new/type.py delete mode 100644 mavecore/validation_new/urn.py delete mode 100644 mavecore/validation_new/user.py diff --git a/mavecore/validation_new/experiment.py b/mavecore/validation_new/experiment.py deleted file mode 100644 index 06ca2ca..0000000 --- a/mavecore/validation_new/experiment.py +++ /dev/null @@ -1,81 +0,0 @@ -from type import * -import summary, metadata, urn, user, general, identifiers - - -def validate_experiment(experiment): - """ - This function validates an experiment. - - Parameters: - __________ - experiment: dict - The experiment represented as a dictionary. - - Raises: - ______ - ValidationError - If the experiment is not a dictionary or if any key:value pair in the experiment is not valid. 
- """ - # check type - is_dictionary(experiment) - # "experiment": { - # "title": "string", - # "shortDescription": "string", - # "abstractText": "string", - # "methodText": "string", - summary.validate_title(experiment.get("title")) - summary.validate_short_description(experiment.get("shortDescription")) - summary.validate_abstract(experiment.get("abstractText")) - summary.validate_methods(experiment.get("methodText")) - # "extraMetadata": {}, - metadata.validate_metadata(experiment.get("extraMetadata")) - # "keywords": [ - # "string" - # ], - summary.validate_keywords(experiment.get("keywords")) - # "urn": "string", - urn.validate_experiment_urn(experiment.get("urn")) - # "numScoresets": 0, - summary.validate_num_scoresets(experiment.get("numScoresets")) - # "createdBy": { - # "orcid_id": "string", - # "firstName": "string", - # "lastName": "string", - # "email": "string" - # }, - # "modifiedBy": { - # "orcid_id": "string", - # "firstName": "string", - # "lastName": "string", - # "email": "string" - # }, - user.validate_user(experiment.get("createdBy")) - user.validate_user(experiment.get("modifiedBy")) - # "creationDate": "2022-08-02", - # "modificationDate": "2022-08-02", - # "publishedDate": "2022-08-02", - general.validate_date(experiment.get("creationDate")) - general.validate_date(experiment.get("modificationDate")) - general.validate_date(experiment.get("publishedDate")) - # "experimentSetUrn": "string", - urn.validate_experimentset_urn(experiment.get("experimentSetUrn")) - # "doiIdentifiers": [ - # { - # "identifier": "string", - # "id": 0, - # "url": "string" - # } - # ], - # "pubmedIdentifiers": [ - # { - # "identifier": "string", - # "id": 0, - # "url": "string", - # "referenceHtml": "string" - # } - # ], - identifiers.validate_doi_identifiers(experiment.get("doiIdentifiers")) - identifiers.validate_pubmed_identifiers(experiment.get("pubmedIdentifiers")) - # "processingState": "string" - 
general.validate_processing_state(experiment.get("processingState")) - # }, \ No newline at end of file diff --git a/mavecore/validation_new/general.py b/mavecore/validation_new/general.py deleted file mode 100644 index becaeb7..0000000 --- a/mavecore/validation_new/general.py +++ /dev/null @@ -1,73 +0,0 @@ -from exceptions import ValidationError -import datetime - - -def validate_data_usage_policy(dataUsagePolicy): - """ - Validates - :param dataUsagePolicy: - :return: - """ - - -def validate_license_id(licenseId): - """ - - :param licenseId: - :return: - """ - - -def validate_replaces_id(replacesId): - """ - - :param replacesId: - :return: - """ - - -def validate_processing_state(processingState): - """ - - :param processingState: - :return: - """ - return - - -def validate_date(date): - """ - Validates a date such as creation date, modification date and published date. - - Parameters: - __________ - date: str - The date to be validated. - - Raises: - ______ - ValidationError - If the date is in the wrong format. - """ - template = '%Y-%m-%d' - try: - datetime.datetime.strptime(date, template) - except ValidationError: - print("Date should be formatted as YYYY-MM-DD") - - -def validate_private(private): - """ - Validate private attribute. - - Parameters: - __________ - private: bool - The boolean private attribute to be validated. - - Raises: - ______ - ValidationError - If the private attribute is not a bool. - """ - if type(private) != bool: raise ValidationError("The private attribute should be of type boolean.") \ No newline at end of file diff --git a/mavecore/validation_new/identifiers.py b/mavecore/validation_new/identifiers.py deleted file mode 100644 index 3173c49..0000000 --- a/mavecore/validation_new/identifiers.py +++ /dev/null @@ -1,46 +0,0 @@ -def validate_doi_identifiers(doiIdentifiers): - """ - This function validates a list of DOI identifiers represented as dictionaries. 
- - Parameters: - __________ - doiIdentifiers: list[dict] - The DOI identifiers that need to be validated - - Raises: - ______ - ValidationError - If any identifier is found to be invalid or if doiIdentifiers is not a list of dictionaries. - """ - return -#"doiIdentifiers": [ - # { - # "identifier": "string", - # "id": 0, - # "url": "string" - # } - # ], - - -def validate_pubmed_identifiers(pubmedIdentifiers): - """ - This function validates a list of PubMed identifiers represented as dictionaries. - - Parameters: - __________ - pubmedIdentifiers: list[dict] - The PubMed identifiers that need to be validated - - Raises: - ______ - ValidationError - If any identifier is found to be invalid or if pubMed Identifiers is not a list of dictionaries. - """ - #"pubmedIdentifiers": [ - # { - # "identifier": "string", - # "id": 0, - # "url": "string", - # "referenceHtml": "string" - #} - return \ No newline at end of file diff --git a/mavecore/validation_new/metadata.py b/mavecore/validation_new/metadata.py deleted file mode 100644 index 981bf62..0000000 --- a/mavecore/validation_new/metadata.py +++ /dev/null @@ -1,15 +0,0 @@ -def validate_metadata(extraMetadata): - """ - This function validates metadata associated with an upload. - - Parameters: - __________ - extraMetadata: Dict - The metadata to be validated. - - Raises: - ______ - ValidationError - If any of the key:value pairs are not valid. - """ - #TODO \ No newline at end of file diff --git a/mavecore/validation_new/scoreset.py b/mavecore/validation_new/scoreset.py deleted file mode 100644 index be7584d..0000000 --- a/mavecore/validation_new/scoreset.py +++ /dev/null @@ -1,89 +0,0 @@ -from exceptions import ValidationError -from type import * -import urn, summary, metadata, general, experiment, identifiers, user, target - - -def validate_scoreset(scoreset, files): - """ - Validates a scoreset represented as a dictionary. - - Parameters: - __________ - scoreset: Dict - The scoreset that will be validated. 
- files: path - The path to the files belonging to the scoreset. - - Raises: - ______ - ValidationError - If scoreset is not a dictionary or if any additional validation fails. - """ - # first validate that scoreset is a dictionary - is_dictionary(scoreset) - # { - # "urn": "string", - # "title": "string", - # "methodText": "string", - # "abstractText": "string", - # "shortDescription": "string", - urn.validate_scoreset_urn(scoreset.get("urn")) - summary.validate_title(scoreset.get("title")) - summary.validate_methods(scoreset.get("methodText")) - summary.validate_abstract(scoreset.get("abstractText")) - summary.validate_short_description(scoreset.get("shortDescription")) - # "extraMetadata": {}, - metadata.validate_metadata(scoreset.get("extraMetadata")) - # "dataUsagePolicy": "string", - # "licenceId": 0, - # "replacesId": 0, - general.validate_data_usage_policy(scoreset.get("dataUsagePolicy")) - general.validate_license_id(scoreset.get("licenseId")) - general.validate_replaces_id(scoreset.get("replacesId")) - # "keywords": [ - # "string" - # ], - summary.validate_keywords(scoreset.get("keywords")) - # "numVariants": 0, - summary.validate_num_variants(scoreset.get("numVariants")) - # "experiment": { - # }, - experiment.validate_experiemnt(scoreset.get("experiment")) - # "doiIdentifiers": [ - # { - # } - # ], - # "pubmedIdentifiers": [ - # { - # } - # ], - identifiers.validate_doi_identifiers(scoreset.get("doiIdentifiers")) - identifiers.validate_pubmed_identifiers(scoreset.get("pubmedIdentifiers")) - # "publishedDate": "2022-08-02", - # "creationDate": "2022-08-02", - # "modificationDate": "2022-08-02", - general.validate_date(scoreset.get("publishedDate")) - general.validate_date(scoreset.get("creationDate")) - general.validate_date(scoreset.get("modificationDate")) - # "createdBy": { - # "orcid_id": "string", - # "firstName": "string", - # "lastName": "string", - # "email": "string" - # }, - # "modifiedBy": { - # "orcid_id": "string", - # "firstName": "string", 
- # "lastName": "string", - # "email": "string" - # }, - user.validate_user(scoreset.get("createdBy")) - user.validate_user(scoreset.get("modifiedBy")) - # "targetGene": { - # }, - target.validate_target_gene(scoreset.get("targetGene")) - # "datasetColumns": {}, - summary.validate_dataset_columns(scoreset.get("datasetColumns")) - # "private": true - general.validate_private(scoreset.get("private")) - # } diff --git a/mavecore/validation_new/summary.py b/mavecore/validation_new/summary.py deleted file mode 100644 index 238cfa4..0000000 --- a/mavecore/validation_new/summary.py +++ /dev/null @@ -1,117 +0,0 @@ -from exceptions import ValidationError - - -def validate_title(title): - """ - Validates a title of an experiment set, an experiment, or a scoreset. - - Parameters: - __________ - title: str - The title to be validated. - - Raises: - ______ - ValidationError - If the title is not valid. - """ - # check if title is a string - if type(title) != str: raise ValidationError("The title must be a string.") - - # check that title is not too long - - -def validate_short_description(shortDescription): - """ - Validates the short description of an experiment set, an experiment, or a scoreset. - - Parameters: - __________ - shortDescription: str - The short description to be validated. - - Raises: - ______ - ValidationError - If the short description is too long or is not a string. - """ - # check if short description is a string - if type(shortDescription) != str: raise ValidationError("The short description must be a string.") - - # check if short description is too long - count = len(shortDescription.split(" ")) - if count > 50: raise ValidationError("The short description must be less than or equal to 50 words.") - - -def validate_abstract(abstractText): - """ - Validates the abstract of an experiment set, an experiment, or a scoreset. - - Parameters: - __________ - abstractText: str - The abstract to be validated. 
- - Raises: - ______ - ValidationError - If the abstract is too long or is not a string. - """ - # check if short description is a string - if type(abstractText) != str: raise ValidationError("The abstract must be a string.") - - # check if short description is too long - count = len(abstractText.split(" ")) - if count > 200: raise ValidationError("The abstract must be less than or equal to 200 words.") - - -def validate_methods(methodText): - """ - Validates the methods of an experiment set, an experiment, or a scoreset. - - Parameters: - __________ - methodText: str - The methods to be validated. - - Raises: - ______ - ValidationError - If the methods are too long or is not a string. - """ - # check if short description is a string - if type(methodText) != str: raise ValidationError("The methods must be a string.") - - # check if short description is too long - count = len(methodText.split(" ")) - if count > 200: raise ValidationError("The methods must be less than or equal to 200 words.") - - -def validate_keywords(keywords): - """ - Validates the methods of an experiment set, an experiment, or a scoreset. - - Parameters: - __________ - methodText: str - The methods to be validated. - - Raises: - ______ - ValidationError - If the keywords object is not a list of strings. - """ - # check keywords type - if type(keywords) != list[str]: raise ValidationError("The keywords must be a list of strings.") - - -def validate_num_scoresets(numScoresets): - return - - -def validate_num_variants(numVariants): - return - - -def validate_dataset_columns(datasetColumns): - return diff --git a/mavecore/validation_new/target.py b/mavecore/validation_new/target.py deleted file mode 100644 index 5ee0184..0000000 --- a/mavecore/validation_new/target.py +++ /dev/null @@ -1,78 +0,0 @@ -from exceptions import ValidationError -from type import * - - -def validate_target_gene(targetGene): - """ - Validates target gene represented as a dictionary. 
- - Parameters: - __________ - targetGene: dict - The target gene to be validated - - Raises: - ______ - ValidationError - If the target gene is not represented as a dictionary or if any of the key value pairs are invalid. - """ - is_dictionary(targetGene) - # "targetGene": { - # "name": "string", - # "category": "string", - # "referenceMaps": [ - # ], - # "wtSequence": { - # "sequenceType": "string", - # "sequence": "string" - # } - # }, - - -def validate_name(name): - is_string(name) - - -def validate_category(category): - is_string(category) - - -def validate_reference_maps(referenceMaps): - """ - Validates reference maps for the target gene. - - Parameters: - __________ - referenceMaps: list[dict] - The list of reference maps to be validated - - Raises: - ______ - ValidationError - If the referenceMaps are not a list of dictionaries - or if any of the key value pairs in the dictionary are invalid - """ - is_list(referenceMaps) - is_integer(referenceMaps.get("id")) - is_integer(referenceMaps.get("genomeId")) - -# { -# "id": 0, -# "genomeId": 0, -# "targetId": 0, -# "isPrimary": true, -# "genome": { -# "shortName": "string", -# "organismName": "string", -# "genomeId": 0, -# "creationDate": "2022-08-02", -# "modificationDate": "2022-08-02", -# "id": 0 -# }, -# "creationDate": "2022-08-02", -# "modificationDate": "2022-08-02" -# } - - -def validate_wt_sequence(wtSequence): - return \ No newline at end of file diff --git a/mavecore/validation_new/type.py b/mavecore/validation_new/type.py deleted file mode 100644 index 2ad6d15..0000000 --- a/mavecore/validation_new/type.py +++ /dev/null @@ -1,25 +0,0 @@ -from exceptions import ValidationError - - -def is_none(item): - if item is None: raise ValidationError("{} is a required attribute.".format(item)) - - -def is_integer(item): - if type(item) != int: raise ValidationError("{} must be a string.".format(item)) - - -def is_string(item): - if type(item) != item: raise ValidationError("{} must be a string.".format(item)) - 
- -def is_list(item): - if type(item) != item: raise ValidationError("{} must be a list.".format(item)) - - -def is_dictionary(item): - if type(item) != item: raise ValidationError("{} must be a dictionary.".format(item)) - - -def is_boolean(boolean): - if type(boolean) != boolean: raise ValidationError("{} must be a boolean value.".format(boolean)) \ No newline at end of file diff --git a/mavecore/validation_new/urn.py b/mavecore/validation_new/urn.py deleted file mode 100644 index cf3162e..0000000 --- a/mavecore/validation_new/urn.py +++ /dev/null @@ -1,97 +0,0 @@ -from constants.urn import * -from mavecore.validation.exceptions import ValidationError - - -def validate_urn(urn): - """ - This function validates a MaveDB urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The MaveDB urn to be validated. - - Raises - ______ - ValidationError - If the MaveDB urn is not valid. - """ - if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError("{}'s is not a valid urn.".format(urn)) - - -def validate_experimentset_urn(urn): - """ - This function validates a Experiment Set urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Experiment Set urn to be validated. - - Raises - ______ - ValidationError - If the Experiment Set urn is not valid. - """ - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - # "Error test" - "{}'s is not a valid Experiment Set urn.".format(urn) - ) - - -def validate_experiment_urn(urn): - """ - This function validates an Experiment urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Experiment urn to be validated. - - Raises - ______ - ValidationError - If the Experiemnt urn is not valid. 
- """ - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "{}'s is not a valid Experiment urn.".format(urn) - ) - - -def validate_scoreset_urn(urn): - """ - This function validates a Scoreset urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The Scoreset urn to be validated - - Raises - ______ - ValidationError - If the Scoreset urn is not valid. - """ - if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError("{}'s is not a valid score set urn.".format(urn)) - - -def validate_variant_urn(urn): - """ - This function validates a MaveDB Variant urn and raises an error if it is not valid. - - Parameters - __________ - urn : str - The MaveDB Variant urn to be validated. - - Raises - ______ - ValidationError - If the MaveDB Variant urn is not valid. - """ - if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError("{}'s is not a valid Variant urn.".format(urn)) diff --git a/mavecore/validation_new/user.py b/mavecore/validation_new/user.py deleted file mode 100644 index deb70ec..0000000 --- a/mavecore/validation_new/user.py +++ /dev/null @@ -1,97 +0,0 @@ -from exceptions import ValidationError -from type import * - - -def validate_user(userId): - """ - This function validates a user ID. - - Parameters: - __________ - id: dict - The user ID as a dictionary of user attributes - - Raises: - ______ - ValidationError - If any of the user attributes are found to be invalid. - """ - # check id type - is_dictionary(userId) - # run additional validation - validate_orcid_id(userId.get("orcid_id")) - validate_first_name(userId.get("first_name")) - validate_last_name(userId.get("lastName")) - validate_email(userId.get("email")) - - -def validate_orcid_id(orcid_id): - """ - Validates ORCID ID. - - Parameters: - __________ - orcid_id: str - The user's ORCID ID. 
- - Raises: - ______ - ValidationError - If the user's ORCID ID is not valid. - """ - # check type - is_string(orcid_id) - - -def validate_first_name(firstName): - """ - Validates user's first name. - - Parameters: - __________ - firstName: str - The user's first name. - - Raises: - ______ - ValidationError - If the user's first name is not a string. - """ - # check type - is_string(firstName) - - -def validate_last_name(lastName): - """ - Validates user's last name. - - Parameters: - __________ - lastName: str - The user's last name. - - Raises: - ______ - ValidationError - If the user's last name is not a string. - """ - # check type - is_string(lastName) - - -def validate_email(email): - """ - Validates user's email. - - Parameters: - __________ - email: str - The user's email. - - Raises: - ______ - ValidationError - If the user's email is not valid. - """ - # check type - is_string(email) From df67c84f1beb34a076a1e55574c48046a27250d7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:29:03 -0700 Subject: [PATCH 542/877] move files --- .../constants}/__init__.py | 0 mavecore/validation/constants/general.py | 51 +++++++++++++++++++ .../constants/summary.py | 0 .../constants/urn.py | 0 mavecore/validation_new/constants/general.py | 2 - .../constants => validation_old}/__init__.py | 0 .../constants.py | 0 .../dataset_validators.py | 0 .../exceptions.py | 0 .../metadata_validators.py | 0 .../urn_validators.py | 0 .../variant_validators/__init__.py | 0 .../variant_validators/hgvs.py | 0 .../variant_validators/variant.py | 0 14 files changed, 51 insertions(+), 2 deletions(-) rename mavecore/{validation_new => validation/constants}/__init__.py (100%) create mode 100644 mavecore/validation/constants/general.py rename mavecore/{validation_new => validation}/constants/summary.py (100%) rename mavecore/{validation_new => validation}/constants/urn.py (100%) delete mode 100644 
mavecore/validation_new/constants/general.py rename mavecore/{validation_new/constants => validation_old}/__init__.py (100%) rename mavecore/{validation => validation_old}/constants.py (100%) rename mavecore/{validation => validation_old}/dataset_validators.py (100%) rename mavecore/{validation_new => validation_old}/exceptions.py (100%) rename mavecore/{validation => validation_old}/metadata_validators.py (100%) rename mavecore/{validation => validation_old}/urn_validators.py (100%) rename mavecore/{validation => validation_old}/variant_validators/__init__.py (100%) rename mavecore/{validation => validation_old}/variant_validators/hgvs.py (100%) rename mavecore/{validation => validation_old}/variant_validators/variant.py (100%) diff --git a/mavecore/validation_new/__init__.py b/mavecore/validation/constants/__init__.py similarity index 100% rename from mavecore/validation_new/__init__.py rename to mavecore/validation/constants/__init__.py diff --git a/mavecore/validation/constants/general.py b/mavecore/validation/constants/general.py new file mode 100644 index 0000000..71e710b --- /dev/null +++ b/mavecore/validation/constants/general.py @@ -0,0 +1,51 @@ +# valid data usage policies + +import re + +""" +Null Constant definitions +""" +NA_STRING = "NA" +null_values_list = ( + "nan", + "na", + "none", + "", + "undefined", + "n/a", + "null", + "nil", +) +# enforce the assumption that these are all lowercase values +null_values_list = [s.lower() for s in null_values_list] +# add the NA_STRING only if it's not already in the list +if NA_STRING.lower() not in null_values_list: + null_values_list.append(NA_STRING.lower()) +null_values_list.sort() + +null_values_re = re.compile( + r"^\s+$|" + "|".join(f"^{s}$" for s in null_values_list if len(s)), + flags=re.IGNORECASE, +) + +readable_null_values_list = [f"'{s}'" for s in null_values_list] + ["whitespace"] + +hgvs_nt_column = "hgvs_nt" +hgvs_splice_column = "hgvs_splice" +hgvs_pro_column = "hgvs_pro" +hgvs_columns = 
sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) +meta_data = "meta_data" +score_columns = "score_columns" +count_columns = "count_columns" +variant_score_data = "score_data" +variant_count_data = "count_data" +required_score_column = "score" + +valid_dataset_columns = [score_columns, count_columns] +valid_variant_columns = [variant_score_data, variant_count_data] + +variant_to_scoreset_column = { + variant_score_data: score_columns, + variant_count_data: count_columns, +} +scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} diff --git a/mavecore/validation_new/constants/summary.py b/mavecore/validation/constants/summary.py similarity index 100% rename from mavecore/validation_new/constants/summary.py rename to mavecore/validation/constants/summary.py diff --git a/mavecore/validation_new/constants/urn.py b/mavecore/validation/constants/urn.py similarity index 100% rename from mavecore/validation_new/constants/urn.py rename to mavecore/validation/constants/urn.py diff --git a/mavecore/validation_new/constants/general.py b/mavecore/validation_new/constants/general.py deleted file mode 100644 index 82ff0d7..0000000 --- a/mavecore/validation_new/constants/general.py +++ /dev/null @@ -1,2 +0,0 @@ -# valid data usage policies - diff --git a/mavecore/validation_new/constants/__init__.py b/mavecore/validation_old/__init__.py similarity index 100% rename from mavecore/validation_new/constants/__init__.py rename to mavecore/validation_old/__init__.py diff --git a/mavecore/validation/constants.py b/mavecore/validation_old/constants.py similarity index 100% rename from mavecore/validation/constants.py rename to mavecore/validation_old/constants.py diff --git a/mavecore/validation/dataset_validators.py b/mavecore/validation_old/dataset_validators.py similarity index 100% rename from mavecore/validation/dataset_validators.py rename to mavecore/validation_old/dataset_validators.py diff --git a/mavecore/validation_new/exceptions.py 
b/mavecore/validation_old/exceptions.py similarity index 100% rename from mavecore/validation_new/exceptions.py rename to mavecore/validation_old/exceptions.py diff --git a/mavecore/validation/metadata_validators.py b/mavecore/validation_old/metadata_validators.py similarity index 100% rename from mavecore/validation/metadata_validators.py rename to mavecore/validation_old/metadata_validators.py diff --git a/mavecore/validation/urn_validators.py b/mavecore/validation_old/urn_validators.py similarity index 100% rename from mavecore/validation/urn_validators.py rename to mavecore/validation_old/urn_validators.py diff --git a/mavecore/validation/variant_validators/__init__.py b/mavecore/validation_old/variant_validators/__init__.py similarity index 100% rename from mavecore/validation/variant_validators/__init__.py rename to mavecore/validation_old/variant_validators/__init__.py diff --git a/mavecore/validation/variant_validators/hgvs.py b/mavecore/validation_old/variant_validators/hgvs.py similarity index 100% rename from mavecore/validation/variant_validators/hgvs.py rename to mavecore/validation_old/variant_validators/hgvs.py diff --git a/mavecore/validation/variant_validators/variant.py b/mavecore/validation_old/variant_validators/variant.py similarity index 100% rename from mavecore/validation/variant_validators/variant.py rename to mavecore/validation_old/variant_validators/variant.py From f856f4096147502b2e24aa3240a99b694eef9dbf Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:29:19 -0700 Subject: [PATCH 543/877] edit import --- mavecore/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 39f086b..5c61b23 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -6,7 +6,7 @@ from .identifier import DoiIdentifier, PubmedIdentifier from .target import TargetGene -from mavecore.validation_new.constants.urn 
import * +from mavecore.validation.constants.urn import * from mavecore.validation.utilities import is_null From 731a1ea439464e96f09e6fb84154ca29e4c30e0f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:29:33 -0700 Subject: [PATCH 544/877] define test case --- tests/test_model_validation/data.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_model_validation/data.py b/tests/test_model_validation/data.py index 2ac41b5..d61e5ea 100644 --- a/tests/test_model_validation/data.py +++ b/tests/test_model_validation/data.py @@ -170,3 +170,16 @@ def test_valid_all_fields(self): "private": True, } ScoreSet.parse_obj(scoreset) + + def test_invalid_keywords(self): + #TODO make sure all required fields are present - as written, this should not pass + scoreset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + #"keywords": ["null"], + } + with self.assertRaises(ValidationError): + ScoreSet.parse_obj(scoreset) + From f2dff74a037c511dda7190ff7fdb4b6c7d2efde1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:29:45 -0700 Subject: [PATCH 545/877] reformat --- mavecore/validation/utilities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 4ea0480..db72ca6 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -1,4 +1,5 @@ -from mavecore.validation.constants import null_values_re +from mavecore.validation.constants.general import null_values_re + def is_null(value): """ From 1a8578b965adf0697071195e7e86e6805b2f2d6c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:27:05 -0600 Subject: [PATCH 546/877] outline dataset validation --- mavecore/validation/dataset.py | 151 
++++++++++++++++++ .../test_variant_validators/__init__.py | 0 2 files changed, 151 insertions(+) create mode 100644 mavecore/validation/dataset.py delete mode 100644 tests/test_validation/test_variant_validators/__init__.py diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py new file mode 100644 index 0000000..28df7b7 --- /dev/null +++ b/mavecore/validation/dataset.py @@ -0,0 +1,151 @@ +from numpy.testing import assert_array_equal +from mavecore.validation.constants.general import * +from mavecore.validation.exceptions import ValidationError + + +def validate_dataframes(scores=None, counts=None): + """ + Validates scores and counts dataframes for MaveDB upload. This function performs + comprehensive validation. + + Parameters + __________ + scores: pandas.DataFrame + The scores data as a pandas dataframe. + counts: pandas.DataFrame + The counts data as a pandas dataframe. + + Raises + ______ + ValidationError + If any of the validation fails. + """ + validate_no_null_columns_rows(scores) + validate_column_names(scores.columns) + validate_variants(scores) + if counts is not None: + validate_no_null_columns_rows(counts) + validate_column_names(counts.columns) + validate_variants(counts) + validate_dataframes_define_same_variants(scores, counts) + + +def validate_no_null_columns_rows(dataframe): + """ + Checks that there are no null columns or rows in the dataframe. Note that a null + column may still have a valid column name. + + Parameters + __________ + dataframe: pandas.DataFrame + The scores or counts dataframe being validated + + Raises + ______ + ValidationError + If there are null columns or rows in the dataframe + """ + pass + + +def validate_column_names(columns): + """ + This function validates the columns in a dataframe. The first columns should be + an hgvs column such as hgvs_nt, hgvs_pro, and hgvs_splice. There should be at least + one column beyond the hgvs columns. 
A scores dataframe should have a score column and + a counts dataframe should have a counts column. There should not be any null columns. + The column names will also be validated against unusual file conversions that could + corrupt the column names. + + Parameters + __________ + dataframe: pandas.DataFrame + The scores or counts dataframe to be validated. + + Raises + ______ + ValidationError + If the column names are not formatted correctly. + """ + # first columns should be hgvs columns + # there should be at least one additional column beyond the hgvs columns + # there should not be any null columns + # validate against UTF-8byte ordering marks + pass + + +def validate_variants(dataframe): + """ + + :param dataframe: + :return: + """ + # variant strings will be cast into hgvs variant objects to validate + # variants should align with the hgvs column names + pass + + +def validate_variants_match_hgvs_column_name(dataframe): + """ + + :param dataframe: + :return: + """ + pass + + +def validate_hgvs_columns_define_same_variants(nt=None, pro=None): + """ + Checks that, when both hgvs_nt and hgvs_pro columns exist, the variant strings within + those columns are representing the same change. + + Parameters + __________ + nt: list + The hgvs_nt column represented as a list. + pro: list + The hgvs_pro column represented as a list. + + Raises + ______ + ValidationError + If any of the variants within each column do not represent the same change. + """ + pass + + +def validate_dataframes_define_same_variants(scores, counts): + """ + Checks if two `pd.DataFrame` objects parsed from uploaded files + define the same variants. + + Parameters + ---------- + scores: pandas.DataFrame + Scores dataframe parsed from an uploaded scores file. + counts: pandas.DataFrame + Scores dataframe parsed from an uploaded counts file. + + Raises + ______ + ValidationError + If score and counts files do not define the same variants. 
+ """ + try: + assert_array_equal( + scores[hgvs_nt_column].sort_values().values, + counts[hgvs_nt_column].sort_values().values, + ) + assert_array_equal( + scores[hgvs_splice_column].sort_values().values, + counts[hgvs_splice_column].sort_values().values, + ) + assert_array_equal( + scores[hgvs_pro_column].sort_values().values, + counts[hgvs_pro_column].sort_values().values, + ) + except AssertionError: + raise ValidationError( + "Your score and counts files do not define the same variants. " + "Check that the hgvs columns in both files match." + ) diff --git a/tests/test_validation/test_variant_validators/__init__.py b/tests/test_validation/test_variant_validators/__init__.py deleted file mode 100644 index e69de29..0000000 From 1fcccf0ace4d54cdbc8c5b837366e8a813c6716b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:27:31 -0600 Subject: [PATCH 547/877] add init file --- tests/test_validation_old/test_variant_validators/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_validation_old/test_variant_validators/__init__.py diff --git a/tests/test_validation_old/test_variant_validators/__init__.py b/tests/test_validation_old/test_variant_validators/__init__.py new file mode 100644 index 0000000..e69de29 From 68ad9515f884f2029fda9b5a647173f3b2b26e4b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:27:50 -0600 Subject: [PATCH 548/877] outline dataset validation test cases --- tests/test_validation/dataset.py | 54 ++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tests/test_validation/dataset.py diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py new file mode 100644 index 0000000..4062033 --- /dev/null +++ b/tests/test_validation/dataset.py @@ -0,0 +1,54 @@ +from unittest import TestCase +from mavecore.validation.exceptions import ValidationError 
+ + +class TestValidateNoNullColumnsRows(TestCase): + def test_valid(self): + pass + + def test_null_row(self): + pass + + def test_null_column(self): + pass + + +class TestValidateColumnNames(TestCase): + def test_valid_column_names(self): + pass + + def test_missing_hgvs_column(self): + pass + + def test_hgvs_in_wrong_location(self): + pass + + def test_no_additional_columns_beyond_hgvs(self): + pass + + def test_null_column_name(self): + pass + + +class TestValidateVariants(TestCase): + def test_valid_variants(self): + pass + + def test_invalid_variants(self): + pass + + +class TestVariantsMatchHgvsColumnNames(TestCase): + def test_valid(self): + pass + + def test_mismatched_variants_and_column_names(self): + pass + + +class TestDataframesDefineSameVariants(TestCase): + def test_valid(self): + pass + + def test_dataframes_do_not_define_same_variants(self): + pass \ No newline at end of file From 2c22a45b97145ff30586035aa50f60408ed06b7e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 29 Aug 2022 15:28:01 -0600 Subject: [PATCH 549/877] add init file --- tests/test_validation_old/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_validation_old/__init__.py diff --git a/tests/test_validation_old/__init__.py b/tests/test_validation_old/__init__.py new file mode 100644 index 0000000..e69de29 From 8243f17e6a629bf8b0dbcbfb7f0e9b38c8ac10d7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:40:23 -0700 Subject: [PATCH 550/877] edit imports --- mavecore/validation/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 28df7b7..5fa9563 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -1,4 +1,6 @@ from numpy.testing import assert_array_equal +from pandas.testing import assert_frame_equal +from mavehgvs import Variant 
from mavecore.validation.constants.general import * from mavecore.validation.exceptions import ValidationError From e289bfbafe7361f3f9aec9283fc42f13ee556ab5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:40:46 -0700 Subject: [PATCH 551/877] correct function name --- mavecore/validation/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 5fa9563..192926f 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -22,11 +22,11 @@ def validate_dataframes(scores=None, counts=None): ValidationError If any of the validation fails. """ - validate_no_null_columns_rows(scores) + validate_no_null_columns_or_rows(scores) validate_column_names(scores.columns) validate_variants(scores) if counts is not None: - validate_no_null_columns_rows(counts) + validate_no_null_columns_or_rows(counts) validate_column_names(counts.columns) validate_variants(counts) validate_dataframes_define_same_variants(scores, counts) From bf32798ea80cf35c5f9bef451a450474ae395d5c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:40:58 -0700 Subject: [PATCH 552/877] edit function signature --- mavecore/validation/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 192926f..994135b 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -32,7 +32,7 @@ def validate_dataframes(scores=None, counts=None): validate_dataframes_define_same_variants(scores, counts) -def validate_no_null_columns_rows(dataframe): +def validate_no_null_columns_or_rows(dataframe): """ Checks that there are no null columns or rows in the dataframe. Note that a null column may still have a valid column name. 
From 019fa386a817bfb1a2acadf37646a2d582f5ad82 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:41:19 -0700 Subject: [PATCH 553/877] write function body --- mavecore/validation/dataset.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 994135b..92d8f76 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -47,7 +47,12 @@ def validate_no_null_columns_or_rows(dataframe): ValidationError If there are null columns or rows in the dataframe """ - pass + df = dataframe.dropna(axis=0, how='all') + df = df.dropna(axis=1, how='all') + try: + assert_frame_equal(df, dataframe) + except AssertionError: + raise ValidationError("Dataset should not contain null columns or rows.") def validate_column_names(columns): From 0de8b73804dedccc3e4709b3bfc95684c6043027 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:41:55 -0700 Subject: [PATCH 554/877] validate column names function body --- mavecore/validation/dataset.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 92d8f76..5f5a6ac 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -74,9 +74,21 @@ def validate_column_names(columns): ValidationError If the column names are not formatted correctly. 
""" + # count instances of hgvs columns + count = 0 + for i in range(len(columns)): + # there should not be any null columns + if columns[i] in readable_null_values_list: raise ValidationError("Column names must not be null.") + if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: count+=1 + # there should be at least one hgvs column + if count == 0: raise ValidationError("Must include hgvs_nt, hgvs_pro, or hgvs_splice column.") # first columns should be hgvs columns + for i in range(count): + if columns[i] not in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: + raise ValidationError("First columns must be hgvs columns.") # there should be at least one additional column beyond the hgvs columns - # there should not be any null columns + if len(columns) == count: + raise ValidationError("There must be at least one additional column beyond the hgvs columns.") # validate against UTF-8byte ordering marks pass From a03872bc5029253245faebae46032c472a21e1b0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:42:01 -0700 Subject: [PATCH 555/877] validate column names function body --- mavecore/validation/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 5f5a6ac..7d23f8d 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -90,7 +90,6 @@ def validate_column_names(columns): if len(columns) == count: raise ValidationError("There must be at least one additional column beyond the hgvs columns.") # validate against UTF-8byte ordering marks - pass def validate_variants(dataframe): From 326c98043d461427707629b6c9e687fe21839f85 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:42:13 -0700 Subject: [PATCH 556/877] edit function signature --- mavecore/validation/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 7d23f8d..030886c 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -92,7 +92,7 @@ def validate_column_names(columns): # validate against UTF-8byte ordering marks -def validate_variants(dataframe): +def validate_variants(variants, column_name=None): """ :param dataframe: From 92d7e0fbb4be56270368cfdd6fa07b0c3ca3dcd0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:42:29 -0700 Subject: [PATCH 557/877] validate variants docstring --- mavecore/validation/dataset.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 030886c..4ac010c 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -94,9 +94,20 @@ def validate_column_names(columns): def validate_variants(variants, column_name=None): """ + Validates a string of variants and verifies that the variant type in the column name makes + sense with regards to the actual variants. - :param dataframe: - :return: + Parameters + __________ + variants: list[str] + List of mavehgvs formatted strings. + column_name: str + The hgvs column name from which the variants parameter originates. + + Raises + ______ + ValidationError + If any variant in the list of variants does not adhere to the mavehgvs specifications. 
""" # variant strings will be cast into hgvs variant objects to validate # variants should align with the hgvs column names From 08115efb43e25dbeebc78b01f6cb7f8ceddbd0d1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:42:40 -0700 Subject: [PATCH 558/877] validate variants function body --- mavecore/validation/dataset.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 4ac010c..7ea3c21 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -110,8 +110,14 @@ def validate_variants(variants, column_name=None): If any variant in the list of variants does not adhere to the mavehgvs specifications. """ # variant strings will be cast into hgvs variant objects to validate - # variants should align with the hgvs column names - pass + for variant in variants: + try: + v = Variant(variant) + # variants should align with the hgvs column names + # check this by seeing if the prefix makes sense with regards to the hgvs column name + validate_variant_matches_hgvs_column_name(column_name, v.prefix) + except ValidationError: + raise ValidationError(variant + " does not adhere to mavehgvs variant guidelines.") def validate_variants_match_hgvs_column_name(dataframe): From b034c5b39779b4e2f359a579614a0924735c038d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:42:55 -0700 Subject: [PATCH 559/877] edit function signature --- mavecore/validation/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index 7ea3c21..af5382d 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -120,7 +120,7 @@ def validate_variants(variants, column_name=None): raise ValidationError(variant + " does not adhere to mavehgvs variant guidelines.") -def 
validate_variants_match_hgvs_column_name(dataframe): +def validate_variant_matches_hgvs_column_name(variant, column_name): """ :param dataframe: From b99a57390d6347f83388e8602e6c390302d5bbd3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:43:12 -0700 Subject: [PATCH 560/877] write docstring for validation function --- mavecore/validation/dataset.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index af5382d..a1ffe41 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -122,9 +122,19 @@ def validate_variants(variants, column_name=None): def validate_variant_matches_hgvs_column_name(variant, column_name): """ + Checks that a variant makes sense with regards to the hgvs column name. - :param dataframe: - :return: + Parameters + __________ + variants: list[str] + List of mavehgvs formatted strings. + column_name: str + The hgvs column name from which the variants parameter originates. + + Raises + ______ + ValidationError + If the variant does not make sense with regards to the hgvs column name. 
""" pass From dee0fe8c7a9474bfa37c34a13d135641bcc76cc4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:43:23 -0700 Subject: [PATCH 561/877] edit docstring --- mavecore/validation/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index a1ffe41..083a674 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -141,7 +141,7 @@ def validate_variant_matches_hgvs_column_name(variant, column_name): def validate_hgvs_columns_define_same_variants(nt=None, pro=None): """ - Checks that, when both hgvs_nt and hgvs_pro columns exist, the variant strings within + Checks that, when two or more of hgvs_nt, hgvs_pro, and hgvs_splice columns exist, the variant strings within those columns are representing the same change. Parameters From 7c53239f3f2867e792148393d0ad64e17bd6fbd5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:43:45 -0700 Subject: [PATCH 562/877] edit imports --- tests/test_validation/dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 4062033..db6956a 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -1,5 +1,9 @@ from unittest import TestCase +import pandas as pd + +from mavecore.validation.constants import general from mavecore.validation.exceptions import ValidationError +from mavecore.validation.dataset import * class TestValidateNoNullColumnsRows(TestCase): From 9709963088b100d1202e94a09889a624389d8f1e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:44:23 -0700 Subject: [PATCH 563/877] edit test class name --- tests/test_validation/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py 
b/tests/test_validation/dataset.py index db6956a..d6b7ea1 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -6,7 +6,7 @@ from mavecore.validation.dataset import * -class TestValidateNoNullColumnsRows(TestCase): +class TestValidateNoNullColumnsOrRows(TestCase): def test_valid(self): pass From 68fd00bf573c29dcdbca4b33d84be6eb6d5f0bd2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:44:33 -0700 Subject: [PATCH 564/877] validate valid df --- tests/test_validation/dataset.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index d6b7ea1..860a5c1 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -8,7 +8,14 @@ class TestValidateNoNullColumnsOrRows(TestCase): def test_valid(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + general.hgvs_pro_column: ["p.Leu5Glu"], + general.hgvs_splice_column: ["c.1A>G"], + } + ) + validate_no_null_columns_or_rows(dataframe) def test_null_row(self): pass From 3fb9dde082cfe6e2934036fb2b46d2a658d17423 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:44:45 -0700 Subject: [PATCH 565/877] validate null row df --- tests/test_validation/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 860a5c1..6987fe2 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -18,7 +18,15 @@ def test_valid(self): validate_no_null_columns_or_rows(dataframe) def test_null_row(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G", None], + general.hgvs_pro_column: ["p.Leu5Glu", None], + general.hgvs_splice_column: ["c.1A>G", None], + } + ) + with self.assertRaises(AssertionError): + 
validate_no_null_columns_or_rows(dataframe) def test_null_column(self): pass From c629496f832035242b5d21c5fc23bcc70cddd6b5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:44:53 -0700 Subject: [PATCH 566/877] validate null column df --- tests/test_validation/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 6987fe2..7469624 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -29,7 +29,15 @@ def test_null_row(self): validate_no_null_columns_or_rows(dataframe) def test_null_column(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G", None], + general.hgvs_pro_column: [None, None], + general.hgvs_splice_column: ["c.1A>G", None], + } + ) + with self.assertRaises(AssertionError): + validate_no_null_columns_or_rows(dataframe) class TestValidateColumnNames(TestCase): From 536a643802aa2110b1a63ae614cca76f4782c365 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:45:08 -0700 Subject: [PATCH 567/877] validate valid column names --- tests/test_validation/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 7469624..1730385 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -42,7 +42,15 @@ def test_null_column(self): class TestValidateColumnNames(TestCase): def test_valid_column_names(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + general.hgvs_pro_column: ["p.Leu5Glu"], + general.hgvs_splice_column: ["c.1A>G"], + "scores": [1.000], + } + ) + validate_column_names(dataframe.columns) def test_missing_hgvs_column(self): pass From 62350f1cb2b54c708225b25e643c7d7ffa4a1dd7 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:45:21 -0700 Subject: [PATCH 568/877] validate missing column names --- tests/test_validation/dataset.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 1730385..6dee416 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -53,7 +53,13 @@ def test_valid_column_names(self): validate_column_names(dataframe.columns) def test_missing_hgvs_column(self): - pass + dataframe = pd.DataFrame( + { + "scores": [1.000], + } + ) + with self.assertRaises(ValidationError): + validate_column_names(dataframe.columns) def test_hgvs_in_wrong_location(self): pass From a4997f01429a41e18c23946af7e0e946bdc2d1f1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:45:37 -0700 Subject: [PATCH 569/877] validate incorrect column name order --- tests/test_validation/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 6dee416..25c1cb7 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -62,7 +62,15 @@ def test_missing_hgvs_column(self): validate_column_names(dataframe.columns) def test_hgvs_in_wrong_location(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + "scores": [1.000], + general.hgvs_splice_column: ["c.1A>G"], + } + ) + with self.assertRaises(ValidationError): + validate_column_names(dataframe.columns) def test_no_additional_columns_beyond_hgvs(self): pass From c8713fed6f07d20dd457a70d6cbc7364c222d9dd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:45:51 -0700 Subject: [PATCH 570/877] validate missing column names --- tests/test_validation/dataset.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 
deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 25c1cb7..e3eabe5 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -73,7 +73,13 @@ def test_hgvs_in_wrong_location(self): validate_column_names(dataframe.columns) def test_no_additional_columns_beyond_hgvs(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + } + ) + with self.assertRaises(ValidationError): + validate_column_names(dataframe.columns) def test_null_column_name(self): pass From 9f0911685578278cbdab1a860aa8bffc2c67b982 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:45:59 -0700 Subject: [PATCH 571/877] validate null column names --- tests/test_validation/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index e3eabe5..110c95c 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -82,7 +82,15 @@ def test_no_additional_columns_beyond_hgvs(self): validate_column_names(dataframe.columns) def test_null_column_name(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + "null": ["c.1A>G"], + "scores": [1.000], + } + ) + with self.assertRaises(ValidationError): + validate_column_names(dataframe.columns) class TestValidateVariants(TestCase): From 9d5fa801f7ace5b1e4f8a502378851629fa2e74b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:46:18 -0700 Subject: [PATCH 572/877] test valid variants --- tests/test_validation/dataset.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 110c95c..ad3bbef 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -95,7 +95,14 @@ def 
test_null_column_name(self): class TestValidateVariants(TestCase): def test_valid_variants(self): - pass + dataframe = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G", "c.1A>G", "c.1A>G"], + general.hgvs_pro_column: ["p.Leu5Glu", "p.Leu5Glu", "p.Leu5Glu"], + general.hgvs_splice_column: ["c.1A>G", "c.1A>G", "c.1A>G"], + } + ) + validate_variants(dataframe["hgvs_nt"]) def test_invalid_variants(self): pass From 75056ef8a226add9689e8ca276a32190218bbd44 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:46:29 -0700 Subject: [PATCH 573/877] define test class --- tests/test_validation/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index ad3bbef..2dcda8e 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -116,6 +116,11 @@ def test_mismatched_variants_and_column_names(self): pass +class TestHgvsColumnsDefineSameVariants(TestCase): + def test_valid(self): + pass + + class TestDataframesDefineSameVariants(TestCase): def test_valid(self): pass From 99cec51e9d79e324c9a19e6afa69717ae5bbc2a6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:47:20 -0700 Subject: [PATCH 574/877] test dataframes define same variants --- tests/test_validation/dataset.py | 71 ++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataset.py index 2dcda8e..677d416 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataset.py @@ -123,7 +123,72 @@ def test_valid(self): class TestDataframesDefineSameVariants(TestCase): def test_valid(self): - pass + scores = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + general.hgvs_pro_column: ["p.Leu5Glu"], + general.hgvs_splice_column: ["c.1A>G"], + } + ) + counts = pd.DataFrame( + { + 
general.hgvs_nt_column: ["c.1A>G"], + general.hgvs_pro_column: ["p.Leu5Glu"], + general.hgvs_splice_column: ["c.1A>G"], + } + ) + validate_dataframes_define_same_variants(scores, counts) - def test_dataframes_do_not_define_same_variants(self): - pass \ No newline at end of file + def test_counts_defines_different_nt_variants(self): + scores = pd.DataFrame( + { + general.hgvs_nt_column: ["c.1A>G"], + general.hgvs_pro_column: [None], + general.hgvs_splice_column: [None], + } + ) + counts = pd.DataFrame( + { + general.hgvs_nt_column: ["c.2A>G"], + general.hgvs_pro_column: [None], + general.hgvs_splice_column: [None], + } + ) + with self.assertRaises(ValidationError): + validate_dataframes_define_same_variants(scores, counts) + + def test_counts_defines_different_splice_variants(self): + scores = pd.DataFrame( + { + general.hgvs_nt_column: [None], + general.hgvs_splice_column: ["c.1A>G"], + general.hgvs_pro_column: [None], + } + ) + counts = pd.DataFrame( + { + general.hgvs_nt_column: [None], + general.hgvs_splice_column: ["c.2A>G"], + general.hgvs_pro_column: [None], + } + ) + with self.assertRaises(ValidationError): + validate_dataframes_define_same_variants(scores, counts) + + def test_counts_defines_different_pro_variants(self): + scores = pd.DataFrame( + { + general.hgvs_nt_column: [None], + general.hgvs_splice_column: [None], + general.hgvs_pro_column: ["p.Leu5Glu"], + } + ) + counts = pd.DataFrame( + { + general.hgvs_nt_column: [None], + general.hgvs_splice_column: [None], + general.hgvs_pro_column: ["p.Leu75Glu"], + } + ) + with self.assertRaises(ValidationError): + validate_dataframes_define_same_variants(scores, counts) \ No newline at end of file From c070b4f636a3b8f33fe147026fbeced9f3727dd9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:47:39 -0700 Subject: [PATCH 575/877] edit inheritance of custom validation error --- mavecore/validation/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index b3e419b..7e5f8eb 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -1,5 +1,5 @@ NON_FIELD_ERRORS = "__all__" -class ValidationError(ValueError): +class ValidationError(ValueError, AssertionError): None \ No newline at end of file From 5a130130602051602c031e31f2d3a37160d07b45 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:48:30 -0700 Subject: [PATCH 576/877] add to and edit null values list construction --- mavecore/validation/constants/general.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/constants/general.py b/mavecore/validation/constants/general.py index 71e710b..057a12e 100644 --- a/mavecore/validation/constants/general.py +++ b/mavecore/validation/constants/general.py @@ -15,9 +15,11 @@ "n/a", "null", "nil", + "-", + None, ) # enforce the assumption that these are all lowercase values -null_values_list = [s.lower() for s in null_values_list] +null_values_list = [s.lower() for s in null_values_list if s is not None] # add the NA_STRING only if it's not already in the list if NA_STRING.lower() not in null_values_list: null_values_list.append(NA_STRING.lower()) From bc2fd0fddce2400471580ac2a8b796703840d24e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:48:35 -0700 Subject: [PATCH 577/877] add to and edit null values list construction --- mavecore/validation/constants/general.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/constants/general.py b/mavecore/validation/constants/general.py index 057a12e..3b0abe0 100644 --- a/mavecore/validation/constants/general.py +++ b/mavecore/validation/constants/general.py @@ -30,7 +30,7 @@ flags=re.IGNORECASE, ) -readable_null_values_list = [f"'{s}'" for s in null_values_list] + 
["whitespace"] +readable_null_values_list = [f"{s}" for s in null_values_list] + ["whitespace"] hgvs_nt_column = "hgvs_nt" hgvs_splice_column = "hgvs_splice" From 571acab387839cfb42e2429cb8cc649521fdb673 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 8 Sep 2022 14:49:38 -0700 Subject: [PATCH 578/877] move files --- .../test_dataset_validators.py | 0 .../test_metadata_validators.py | 0 .../test_variant_validators/test_hgvs_validators.py | 0 .../test_variant_validators/test_validators.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_validation => test_validation_old}/test_dataset_validators.py (100%) rename tests/{test_validation => test_validation_old}/test_metadata_validators.py (100%) rename tests/{test_validation => test_validation_old}/test_variant_validators/test_hgvs_validators.py (100%) rename tests/{test_validation => test_validation_old}/test_variant_validators/test_validators.py (100%) diff --git a/tests/test_validation/test_dataset_validators.py b/tests/test_validation_old/test_dataset_validators.py similarity index 100% rename from tests/test_validation/test_dataset_validators.py rename to tests/test_validation_old/test_dataset_validators.py diff --git a/tests/test_validation/test_metadata_validators.py b/tests/test_validation_old/test_metadata_validators.py similarity index 100% rename from tests/test_validation/test_metadata_validators.py rename to tests/test_validation_old/test_metadata_validators.py diff --git a/tests/test_validation/test_variant_validators/test_hgvs_validators.py b/tests/test_validation_old/test_variant_validators/test_hgvs_validators.py similarity index 100% rename from tests/test_validation/test_variant_validators/test_hgvs_validators.py rename to tests/test_validation_old/test_variant_validators/test_hgvs_validators.py diff --git a/tests/test_validation/test_variant_validators/test_validators.py 
b/tests/test_validation_old/test_variant_validators/test_validators.py similarity index 100% rename from tests/test_validation/test_variant_validators/test_validators.py rename to tests/test_validation_old/test_variant_validators/test_validators.py From 4970eaafec0aa5ff2a5aeb72bd4334f0e66540a4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:15:40 -0700 Subject: [PATCH 579/877] delete files --- mavecore/models/genome.py | 19 ------------------- mavecore/models/user.py | 16 ---------------- tests/test_model_validation/genome.py | 15 --------------- tests/test_model_validation/user.py | 24 ------------------------ 4 files changed, 74 deletions(-) delete mode 100644 mavecore/models/genome.py delete mode 100644 mavecore/models/user.py delete mode 100644 tests/test_model_validation/genome.py delete mode 100644 tests/test_model_validation/user.py diff --git a/mavecore/models/genome.py b/mavecore/models/genome.py deleted file mode 100644 index 4aaf338..0000000 --- a/mavecore/models/genome.py +++ /dev/null @@ -1,19 +0,0 @@ -from pydantic import BaseModel, ValidationError, validator -from datetime import datetime -from typing import Optional - - -class Genome(BaseModel): - shortName: str - organismName: str - genomeId: int - creationDate: Optional[datetime] - modificationDate: Optional[datetime] - id: int - - @validator('creationDate', 'modificationDate') - def date_must_match_regex(cls, v): - # regular expression for validating a date - regex = '%Y-%m-%d' - if not bool(datetime.strptime(v, regex)): - raise ValidationError("{}'s is not a valid date.".format(v)) diff --git a/mavecore/models/user.py b/mavecore/models/user.py deleted file mode 100644 index ca71f4f..0000000 --- a/mavecore/models/user.py +++ /dev/null @@ -1,16 +0,0 @@ -from pydantic import BaseModel, ValidationError, validator -import re - - -class User(BaseModel): - orcid_id: str - firstName: str - lastName: str - email: str - - @validator('email') - 
def check_email_has_valid_structure(cls, v): - # regular expression for validating an Email - regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' - if not (re.fullmatch(regex, v)): - raise ValidationError("{}'s is not a valid email.".format(v)) diff --git a/tests/test_model_validation/genome.py b/tests/test_model_validation/genome.py deleted file mode 100644 index 9c2be1d..0000000 --- a/tests/test_model_validation/genome.py +++ /dev/null @@ -1,15 +0,0 @@ -from unittest import TestCase -from mavecore.models.user import User -from mavecore.models.genome import Genome - - -class TestGenome(TestCase): - def test_valid(self): - genome = { - "shortName": "name", - "organismName": "organism", - "genomeId": 0, - "id": 0, - } - Genome.parse_obj(genome) - diff --git a/tests/test_model_validation/user.py b/tests/test_model_validation/user.py deleted file mode 100644 index 61d5943..0000000 --- a/tests/test_model_validation/user.py +++ /dev/null @@ -1,24 +0,0 @@ -from unittest import TestCase -from pydantic import ValidationError -from mavecore.models.user import User - - -class TestUser(TestCase): - def test_valid_all_fields(self): - user = { - "orcid_id": "idididid", - "firstName": "first", - "lastName": "last", - "email": "firstlast@email.edu", - } - User.parse_obj(user) - - def test_invalid_email(self): - user = { - "orcid_id": "idididid", - "firstName": "first", - "lastName": "last", - "email": "firstlastemail.edu", - } - with self.assertRaises(ValidationError): - User.parse_obj(user) From 93d464bb6fc12ef6c3161b574f82e7fbc3959e62 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:17:18 -0700 Subject: [PATCH 580/877] edit imports --- mavecore/models/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 71e2d61..72cb85d 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -1,5 +1,5 @@ from pydantic import 
BaseModel, ValidationError, validator -from typing import List +from typing import List, Optional from .map import ReferenceMap from .sequence import WildType From adfff76e6f44d649c958145f1afb63a583c05478 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:17:34 -0700 Subject: [PATCH 581/877] add attributes to pydantic model --- mavecore/models/target.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 72cb85d..9c2bc44 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -8,6 +8,9 @@ class TargetGene(BaseModel): name: str category: str + ensembleIdId: Optional[int] + refseqIdId: Optional[int] + uniprotIdId: Optional[int] referenceMaps: List[ReferenceMap] wtSequence: WildType From 3df78fc526400b93646ff455d1cd65cac2998498 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:18:31 -0700 Subject: [PATCH 582/877] rename model test directory --- tests/{test_model_validation => test_models}/__init__.py | 0 tests/{test_model_validation => test_models}/data.py | 0 tests/{test_model_validation => test_models}/identifier.py | 0 tests/{test_model_validation => test_models}/map.py | 0 tests/{test_model_validation => test_models}/sequence.py | 0 tests/{test_model_validation => test_models}/target.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_model_validation => test_models}/__init__.py (100%) rename tests/{test_model_validation => test_models}/data.py (100%) rename tests/{test_model_validation => test_models}/identifier.py (100%) rename tests/{test_model_validation => test_models}/map.py (100%) rename tests/{test_model_validation => test_models}/sequence.py (100%) rename tests/{test_model_validation => test_models}/target.py (100%) diff --git a/tests/test_model_validation/__init__.py b/tests/test_models/__init__.py similarity index 100% rename from 
tests/test_model_validation/__init__.py rename to tests/test_models/__init__.py diff --git a/tests/test_model_validation/data.py b/tests/test_models/data.py similarity index 100% rename from tests/test_model_validation/data.py rename to tests/test_models/data.py diff --git a/tests/test_model_validation/identifier.py b/tests/test_models/identifier.py similarity index 100% rename from tests/test_model_validation/identifier.py rename to tests/test_models/identifier.py diff --git a/tests/test_model_validation/map.py b/tests/test_models/map.py similarity index 100% rename from tests/test_model_validation/map.py rename to tests/test_models/map.py diff --git a/tests/test_model_validation/sequence.py b/tests/test_models/sequence.py similarity index 100% rename from tests/test_model_validation/sequence.py rename to tests/test_models/sequence.py diff --git a/tests/test_model_validation/target.py b/tests/test_models/target.py similarity index 100% rename from tests/test_model_validation/target.py rename to tests/test_models/target.py From 2a30bc453de96ca9c64dda3d5a9acad6fe951be9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:19:12 -0700 Subject: [PATCH 583/877] rename old test directory --- tests/{test_validation => test_validation_old}/__init__.py | 0 .../test_dataset_validators.py | 0 .../test_metadata_validators.py | 0 .../test_variant_validators/__init__.py | 0 .../test_variant_validators/test_hgvs_validators.py | 0 .../test_variant_validators/test_validators.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_validation => test_validation_old}/__init__.py (100%) rename tests/{test_validation => test_validation_old}/test_dataset_validators.py (100%) rename tests/{test_validation => test_validation_old}/test_metadata_validators.py (100%) rename tests/{test_validation => test_validation_old}/test_variant_validators/__init__.py (100%) rename tests/{test_validation => 
test_validation_old}/test_variant_validators/test_hgvs_validators.py (100%) rename tests/{test_validation => test_validation_old}/test_variant_validators/test_validators.py (100%) diff --git a/tests/test_validation/__init__.py b/tests/test_validation_old/__init__.py similarity index 100% rename from tests/test_validation/__init__.py rename to tests/test_validation_old/__init__.py diff --git a/tests/test_validation/test_dataset_validators.py b/tests/test_validation_old/test_dataset_validators.py similarity index 100% rename from tests/test_validation/test_dataset_validators.py rename to tests/test_validation_old/test_dataset_validators.py diff --git a/tests/test_validation/test_metadata_validators.py b/tests/test_validation_old/test_metadata_validators.py similarity index 100% rename from tests/test_validation/test_metadata_validators.py rename to tests/test_validation_old/test_metadata_validators.py diff --git a/tests/test_validation/test_variant_validators/__init__.py b/tests/test_validation_old/test_variant_validators/__init__.py similarity index 100% rename from tests/test_validation/test_variant_validators/__init__.py rename to tests/test_validation_old/test_variant_validators/__init__.py diff --git a/tests/test_validation/test_variant_validators/test_hgvs_validators.py b/tests/test_validation_old/test_variant_validators/test_hgvs_validators.py similarity index 100% rename from tests/test_validation/test_variant_validators/test_hgvs_validators.py rename to tests/test_validation_old/test_variant_validators/test_hgvs_validators.py diff --git a/tests/test_validation/test_variant_validators/test_validators.py b/tests/test_validation_old/test_variant_validators/test_validators.py similarity index 100% rename from tests/test_validation/test_variant_validators/test_validators.py rename to tests/test_validation_old/test_variant_validators/test_validators.py From 8fcb3b8294fe7ffc0da0f07aa3e0f3353d6c100f Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:20:03 -0700 Subject: [PATCH 584/877] delete unneeded pydantic attributes --- mavecore/models/data.py | 11 ----------- mavecore/models/map.py | 5 ----- 2 files changed, 16 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 5c61b23..101c2fe 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -16,11 +16,6 @@ class DataSet(BaseModel): abstractText: str methodText: str extraMetadata: Optional[Dict] - creationDate: Optional[str] - publishedDate: Optional[str] - modificationDate: Optional[str] - createdBy: Optional[User] - modifiedBy: Optional[User] @validator('creationDate', 'publishedDate', 'modificationDate') def date_must_match_regex(cls, v): @@ -31,13 +26,9 @@ def date_must_match_regex(cls, v): class Experiment(DataSet): - urn: Optional[str] keywords: Optional[List[str]] - numScoresets: Optional[int] - experimentSetUrn: Optional[str] doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] - processingState: Optional[str] @validator('urn') def validate_urn_matches_regex(cls, v): @@ -85,8 +76,6 @@ class ScoreSet(DataSet): doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] targetGene: TargetGene - datasetColumns: Dict - private: bool @validator('urn') def validate_matches_regular_expression(cls, v): diff --git a/mavecore/models/map.py b/mavecore/models/map.py index b3d3c7e..f0070ca 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -6,13 +6,8 @@ class ReferenceMap(BaseModel): - id: int genomeId: int targetId: int - isPrimary: bool - genome: Genome - creationDate: Optional[str] - modificationDate: Optional[str] @validator('creationDate', 'modificationDate') def date_must_match_regex(cls, v): From 943d72b95e151c90456a6b1b07e7ed47aa28fa21 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:20:21 
-0700 Subject: [PATCH 585/877] edit imports --- mavecore/models/map.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index f0070ca..fae0c3f 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -2,8 +2,6 @@ from datetime import datetime from typing import Optional -from .genome import Genome - class ReferenceMap(BaseModel): genomeId: int From 6b32afc94496598d5c7203fb949ad33550499ae8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:20:45 -0700 Subject: [PATCH 586/877] delete unneeded pydantic model --- mavecore/models/data.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 101c2fe..f3be290 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -53,18 +53,6 @@ def validate_experiment_set_urn_matches_regex(cls, v): raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) -class ExperimentSet(DataSet): - urn: Optional[str] - id: int - experiments: List[Experiment] - numExperiments: int - - @validator('urn') - def validate_matches_regular_expression(cls, v): - if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) - - class ScoreSet(DataSet): urn: Optional[str] dataUsagePolicy: str From b1d5b01f36b80e6816f747e80021073bc66745e5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:21:12 -0700 Subject: [PATCH 587/877] add pydantic model attribute --- mavecore/models/data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index f3be290..40f347e 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -59,8 +59,7 @@ class ScoreSet(DataSet): licenceId: int replacesId: Optional[int] keywords: Optional[List[str]] - numVariants: 
int - experiment: Experiment + experimentUrn: str doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] targetGene: TargetGene From f0b8f07b3b5c590834e85ddba80954c3b627a230 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 13:38:03 -0700 Subject: [PATCH 588/877] validate all --- mavecore/validation/validate.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 mavecore/validation/validate.py diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py new file mode 100644 index 0000000..8b081ac --- /dev/null +++ b/mavecore/validation/validate.py @@ -0,0 +1,32 @@ +from mavecore.models.data import Experiment, ScoreSet +from mavecore.validation.dataframe import validate_dataframes + + +def validate(dataset, dataset_type, scores=None, counts=None): + """ + This function validates data to by uploaded to MaveDB. Descriptive errors will be raised if any of the validation + fails. Scores and counts are optional as this function accepts both experiments and scoresets. + + Parameters + __________ + dataset: dict + The scoreset or experiment to be uploaded. This will be cast into a pydantic object. + dataset_type: str + The type of dataset that the first argument is, either "experiments" or "scoresets". + scores: Pandas.DataFrame + The scores dataframe as a Pandas DataFrame. + counts: Pandas.DataFrame + The counts dataframe as a Pandas DataFrame. + + Raises + ______ + ValueError + If the dataset_type attribute is not a string that reads `experiments` or `scoresets`. 
+ """ + if dataset_type == "experiments": + Experiment.parse_obj(dataset) + elif dataset_type == "scoresets": + ScoreSet.parse_obj(dataset) + validate_dataframes(scores=scores, counts=counts) + else: + raise ValueError("The dataset_type must be a string that reads `experiments` or `scoresets`.") From 4a4499113924733986cb94e10c42893a0b39dc7a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 13:38:19 -0700 Subject: [PATCH 589/877] rename python file --- mavecore/validation/{dataset.py => dataframe.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mavecore/validation/{dataset.py => dataframe.py} (100%) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataframe.py similarity index 100% rename from mavecore/validation/dataset.py rename to mavecore/validation/dataframe.py From 8426068777151b6db1aa49184e5aa71f4834d4c5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 14:33:01 -0700 Subject: [PATCH 590/877] edit imports --- tests/test_validation/{dataset.py => dataframe.py} | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) rename tests/test_validation/{dataset.py => dataframe.py} (97%) diff --git a/tests/test_validation/dataset.py b/tests/test_validation/dataframe.py similarity index 97% rename from tests/test_validation/dataset.py rename to tests/test_validation/dataframe.py index 677d416..1117fac 100644 --- a/tests/test_validation/dataset.py +++ b/tests/test_validation/dataframe.py @@ -1,9 +1,7 @@ from unittest import TestCase import pandas as pd -from mavecore.validation.constants import general -from mavecore.validation.exceptions import ValidationError -from mavecore.validation.dataset import * +from mavecore.validation.dataframe import * class TestValidateNoNullColumnsOrRows(TestCase): From 16565b1bf48a79727682b235b04bb46f5e12cf42 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: 
Wed, 14 Sep 2022 14:33:37 -0700 Subject: [PATCH 591/877] add setUp methods to test classes --- tests/test_validation/dataframe.py | 44 ++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test_validation/dataframe.py b/tests/test_validation/dataframe.py index 1117fac..d363ed6 100644 --- a/tests/test_validation/dataframe.py +++ b/tests/test_validation/dataframe.py @@ -5,6 +5,15 @@ class TestValidateNoNullColumnsOrRows(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + } + ) + def test_valid(self): dataframe = pd.DataFrame( { @@ -39,6 +48,16 @@ def test_null_column(self): class TestValidateColumnNames(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + required_score_column: [1.000], + } + ) + def test_valid_column_names(self): dataframe = pd.DataFrame( { @@ -92,6 +111,15 @@ def test_null_column_name(self): class TestValidateVariants(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G", "c.1A>G", "c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu", "p.Leu5Glu", "p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G", "c.1A>G", "c.1A>G"], + } + ) + def test_valid_variants(self): dataframe = pd.DataFrame( { @@ -120,6 +148,22 @@ def test_valid(self): class TestDataframesDefineSameVariants(TestCase): + def setUp(self): + self.scores = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + } + ) + self.counts = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_splice_column: ["c.1A>G"], + } + ) + def test_valid(self): scores = pd.DataFrame( { From 69347d20c024f911c53851885289f5a55089c87b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 
14:34:50 -0700 Subject: [PATCH 592/877] refactor unittests to work with setUp methods --- tests/test_validation/dataframe.py | 151 +++++------------------------ 1 file changed, 22 insertions(+), 129 deletions(-) diff --git a/tests/test_validation/dataframe.py b/tests/test_validation/dataframe.py index d363ed6..f4cebba 100644 --- a/tests/test_validation/dataframe.py +++ b/tests/test_validation/dataframe.py @@ -15,36 +15,17 @@ def setUp(self): ) def test_valid(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - general.hgvs_pro_column: ["p.Leu5Glu"], - general.hgvs_splice_column: ["c.1A>G"], - } - ) - validate_no_null_columns_or_rows(dataframe) + validate_no_null_columns_or_rows(self.dataframe) def test_null_row(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G", None], - general.hgvs_pro_column: ["p.Leu5Glu", None], - general.hgvs_splice_column: ["c.1A>G", None], - } - ) + self.dataframe.loc[1] = [None, None, None] with self.assertRaises(AssertionError): - validate_no_null_columns_or_rows(dataframe) + validate_no_null_columns_or_rows(self.dataframe) def test_null_column(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G", None], - general.hgvs_pro_column: [None, None], - general.hgvs_splice_column: ["c.1A>G", None], - } - ) + self.dataframe[hgvs_pro_column][0] = None with self.assertRaises(AssertionError): - validate_no_null_columns_or_rows(dataframe) + validate_no_null_columns_or_rows(self.dataframe) class TestValidateColumnNames(TestCase): @@ -59,55 +40,27 @@ def setUp(self): ) def test_valid_column_names(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - general.hgvs_pro_column: ["p.Leu5Glu"], - general.hgvs_splice_column: ["c.1A>G"], - "scores": [1.000], - } - ) - validate_column_names(dataframe.columns) + validate_column_names(self.dataframe.columns) def test_missing_hgvs_column(self): - dataframe = pd.DataFrame( - { - "scores": [1.000], - } - ) + 
self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column], axis=1) with self.assertRaises(ValidationError): - validate_column_names(dataframe.columns) + validate_column_names(self.dataframe.columns) def test_hgvs_in_wrong_location(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - "scores": [1.000], - general.hgvs_splice_column: ["c.1A>G"], - } - ) + self.dataframe = self.dataframe[[hgvs_nt_column, required_score_column, hgvs_pro_column, hgvs_splice_column]] with self.assertRaises(ValidationError): - validate_column_names(dataframe.columns) + validate_column_names(self.dataframe.columns) def test_no_additional_columns_beyond_hgvs(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - } - ) + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) with self.assertRaises(ValidationError): - validate_column_names(dataframe.columns) + validate_column_names(self.dataframe.columns) def test_null_column_name(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - "null": ["c.1A>G"], - "scores": [1.000], - } - ) + self.dataframe.rename(columns={hgvs_splice_column: 'null'}, inplace=True) with self.assertRaises(ValidationError): - validate_column_names(dataframe.columns) + validate_column_names(self.dataframe.columns) class TestValidateVariants(TestCase): @@ -121,14 +74,7 @@ def setUp(self): ) def test_valid_variants(self): - dataframe = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G", "c.1A>G", "c.1A>G"], - general.hgvs_pro_column: ["p.Leu5Glu", "p.Leu5Glu", "p.Leu5Glu"], - general.hgvs_splice_column: ["c.1A>G", "c.1A>G", "c.1A>G"], - } - ) - validate_variants(dataframe["hgvs_nt"]) + validate_variants(self.dataframe["hgvs_nt"]) def test_invalid_variants(self): pass @@ -165,72 +111,19 @@ def setUp(self): ) def test_valid(self): - scores = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - 
general.hgvs_pro_column: ["p.Leu5Glu"], - general.hgvs_splice_column: ["c.1A>G"], - } - ) - counts = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - general.hgvs_pro_column: ["p.Leu5Glu"], - general.hgvs_splice_column: ["c.1A>G"], - } - ) - validate_dataframes_define_same_variants(scores, counts) + validate_dataframes_define_same_variants(self.scores, self.counts) def test_counts_defines_different_nt_variants(self): - scores = pd.DataFrame( - { - general.hgvs_nt_column: ["c.1A>G"], - general.hgvs_pro_column: [None], - general.hgvs_splice_column: [None], - } - ) - counts = pd.DataFrame( - { - general.hgvs_nt_column: ["c.2A>G"], - general.hgvs_pro_column: [None], - general.hgvs_splice_column: [None], - } - ) + self.counts[hgvs_nt_column][0] = "c.2A>G" with self.assertRaises(ValidationError): - validate_dataframes_define_same_variants(scores, counts) + validate_dataframes_define_same_variants(self.scores, self.counts) def test_counts_defines_different_splice_variants(self): - scores = pd.DataFrame( - { - general.hgvs_nt_column: [None], - general.hgvs_splice_column: ["c.1A>G"], - general.hgvs_pro_column: [None], - } - ) - counts = pd.DataFrame( - { - general.hgvs_nt_column: [None], - general.hgvs_splice_column: ["c.2A>G"], - general.hgvs_pro_column: [None], - } - ) + self.counts[hgvs_splice_column][0] = "c.2A>G" with self.assertRaises(ValidationError): - validate_dataframes_define_same_variants(scores, counts) + validate_dataframes_define_same_variants(self.scores, self.counts) def test_counts_defines_different_pro_variants(self): - scores = pd.DataFrame( - { - general.hgvs_nt_column: [None], - general.hgvs_splice_column: [None], - general.hgvs_pro_column: ["p.Leu5Glu"], - } - ) - counts = pd.DataFrame( - { - general.hgvs_nt_column: [None], - general.hgvs_splice_column: [None], - general.hgvs_pro_column: ["p.Leu75Glu"], - } - ) + self.counts[hgvs_pro_column][0] = "p.Leu75Glu" with self.assertRaises(ValidationError): - 
validate_dataframes_define_same_variants(scores, counts) \ No newline at end of file + validate_dataframes_define_same_variants(self.scores, self.counts) \ No newline at end of file From a9f068e6631e1a0b61909b3f63cee1755d9573d5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 14:34:59 -0700 Subject: [PATCH 593/877] reformat file --- mavecore/validation/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/exceptions.py b/mavecore/validation/exceptions.py index 7e5f8eb..0d18a8b 100644 --- a/mavecore/validation/exceptions.py +++ b/mavecore/validation/exceptions.py @@ -2,4 +2,4 @@ class ValidationError(ValueError, AssertionError): - None \ No newline at end of file + None From a74a2fa2430c616da14475817d609d9969b81929 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:15:39 -0700 Subject: [PATCH 594/877] edit imports --- mavecore/models/data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 40f347e..d5bebcd 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -1,8 +1,6 @@ from pydantic import BaseModel, ValidationError, validator -from datetime import datetime from typing import List, Dict, Optional -from .user import User from .identifier import DoiIdentifier, PubmedIdentifier from .target import TargetGene From e0e1b127c13b56d40f26a135b5eb62a82d23b4d7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:16:14 -0700 Subject: [PATCH 595/877] add dataset pydantic attribute including custom validation --- mavecore/models/data.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index d5bebcd..e3c6498 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -14,13 +14,16 @@ class 
DataSet(BaseModel): abstractText: str methodText: str extraMetadata: Optional[Dict] + keywords: Optional[List[str]] - @validator('creationDate', 'publishedDate', 'modificationDate') - def date_must_match_regex(cls, v): - # regular expression for validating a date - regex = '%Y-%m-%d' - if not bool(datetime.strptime(v, regex)): - raise ValidationError("{}'s is not a valid date.".format(v)) + @validator('keywords') + def validate_keywords(cls, v): + if is_null(v): + raise ValidationError("{} are not valid keywords. Keywords must be a valid list of strings.".format(v)) + else: + for keyword in v: + if is_null(keyword) or not isinstance(keyword, str): + raise ValidationError("{} not a valid keyword. Keywords must be valid strings.".format(keyword)) class Experiment(DataSet): From 42e5d7522f13be38f98fb6149fc6091fc3079644 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:16:40 -0700 Subject: [PATCH 596/877] remove pydantic attribute --- mavecore/models/data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index e3c6498..38b3dc6 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -27,7 +27,6 @@ def validate_keywords(cls, v): class Experiment(DataSet): - keywords: Optional[List[str]] doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] @@ -59,7 +58,6 @@ class ScoreSet(DataSet): dataUsagePolicy: str licenceId: int replacesId: Optional[int] - keywords: Optional[List[str]] experimentUrn: str doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] From 450aab2609ddc8b5f692e3b9352564447ea3801c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:16:55 -0700 Subject: [PATCH 597/877] remove custom validation --- mavecore/models/data.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git 
a/mavecore/models/data.py b/mavecore/models/data.py index 38b3dc6..9c7d01a 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -30,28 +30,6 @@ class Experiment(DataSet): doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] - @validator('urn') - def validate_urn_matches_regex(cls, v): - regex = MAVEDB_TMP_URN_RE - if not (re.fullmatch(regex, v)): - raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) - #if not (MAVEDB_EXPERIMENTSET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - # raise ValidationError("{}'s is not a valid Experiment Set urn.".format(v)) - - @validator('keywords') - def validate_keywords(cls, v): - if is_null(v): - raise ValidationError("{} are not valid keywords. Keywords must be a valid list of strings.".format(v)) - else: - for keyword in v: - if is_null(keyword) or not isinstance(keyword, str): - raise ValidationError("{} not a valid keyword. Keywords must be valid strings.".format(keyword)) - - @validator('experimentSetUrn') - def validate_experiment_set_urn_matches_regex(cls, v): - if not (MAVEDB_EXPERIMENT_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - raise ValidationError("{}'s is not a valid Experiment urn.".format(v)) - class ScoreSet(DataSet): urn: Optional[str] From df9dd98a7774d91b69f94d44fb7332131e0aab9d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:17:23 -0700 Subject: [PATCH 598/877] add setUp methods to dataset pydantic unittests --- tests/test_models/data.py | 53 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/test_models/data.py b/tests/test_models/data.py index d61e5ea..6fd9839 100644 --- a/tests/test_models/data.py +++ b/tests/test_models/data.py @@ -4,6 +4,16 @@ class TestDataSet(TestCase): + def setUp(self): + self.dataset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + 
"methodText": "methods", + "extraMetadata": {}, + "keywords": ["string"], + } + def test_valid_all_fields(self): user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} dataset = { @@ -66,6 +76,20 @@ def test_invalid_modification_date(self): class TestExperiment(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} + self.experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "keywords": ["string"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + } + def test_valid_all_fields(self): user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} @@ -136,6 +160,35 @@ def test_valid(self): class TestScoreSet(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} + reference_map = {"genomeId": 0, "targetId": 0} + sequence = {"sequenceType": "DNA", "sequence": "ATCG"} + target = {"name": "name", + "category": "Protein coding", + "ensembleIdId": 0, + "refseqIdId": 0, + "uniprotIdId": 0, + "referenceMaps": [reference_map], + "wtSequence": sequence} + self.scoreset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + # "urn": "urn", + "dataUsagePolicy": "policy", + "licenceId": 0, + "replacesId": 0, + "keywords": ["string"], + "experimentUrn": "urn", + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + "targetGene": target, + } + def test_valid_all_fields(self): user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} experiment = {"title": "title", 
"shortDescription": "short description", "abstractText": "abstract", "methodText": "methods"} From 4ae58aa59a95ab2fd8e599c5e1a17836e1b78e77 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:17:34 -0700 Subject: [PATCH 599/877] edit imports --- tests/test_models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_models/data.py b/tests/test_models/data.py index 6fd9839..16abcad 100644 --- a/tests/test_models/data.py +++ b/tests/test_models/data.py @@ -1,6 +1,6 @@ from unittest import TestCase from pydantic import ValidationError -from mavecore.models.data import DataSet, Experiment, ExperimentSet, ScoreSet +from mavecore.models.data import DataSet, Experiment, ScoreSet class TestDataSet(TestCase): From 804ebdd73c96b8b46aab9181ba0a4080eb77646c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:18:14 -0700 Subject: [PATCH 600/877] refactor unittests to work with new setUp methods --- tests/test_models/data.py | 183 +++++--------------------------------- 1 file changed, 21 insertions(+), 162 deletions(-) diff --git a/tests/test_models/data.py b/tests/test_models/data.py index 16abcad..42007d9 100644 --- a/tests/test_models/data.py +++ b/tests/test_models/data.py @@ -15,64 +15,17 @@ def setUp(self): } def test_valid_all_fields(self): - user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} - dataset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "extraMetadata": {}, - "creationDate": "2022-02-02", - "publishedDate": "2022-02-02", - "modificationDate": "2022-02-02", - "createdBy": user, - "modifiedBy": user, - } - DataSet.parse_obj(dataset) + DataSet.parse_obj(self.dataset) def test_valid_exclude_optional(self): - dataset = { - "title": "title", - "shortDescription": "short description", - 
"abstractText": "abstract", - "methodText": "methods", - } - DataSet.parse_obj(dataset) - - def test_invalid_creation_date(self): - dataset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "creationDate": "2022-02-02-", - } - with self.assertRaises(ValidationError): - DataSet.parse_obj(dataset) - - def test_invalid_published_date(self): - dataset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "publishedDate": "2022-02-02-", - } - with self.assertRaises(ValidationError): - DataSet.parse_obj(dataset) + self.dataset.pop("extraMetadata") + self.dataset.pop("keywords") + DataSet.parse_obj(self.dataset) - def test_invalid_modification_date(self): - dataset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "creationDate": "2022-02-02", - "publishedDate": "2022-02-02", - "modificationDate": "a", - } + def test_invalid_keywords(self): + self.dataset["keywords"] = ["null"] with self.assertRaises(ValidationError): - DataSet.parse_obj(dataset) + Experiment.parse_obj(self.dataset) class TestExperiment(TestCase): @@ -91,72 +44,14 @@ def setUp(self): } def test_valid_all_fields(self): - user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} - doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} - pubmed_identifier = {"identifier": "29785012"} - experiment = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "extraMetadata": {}, - "creationDate": "2022-02-02", - "publishedDate": "2022-02-02", - "modificationDate": "2022-02-02", - "createdBy": user, - "modifiedBy": user, - #"urn": "tmp:070b3886-ed72-4ce9-a574-6754ad00310b", - "keywords": ["string"], - "numScoresets": 0, - #"experimentSetUrn": "urn", - "doiIdentifiers": 
[doi_identifier], - "pubmedIdentifiers": [pubmed_identifier], - "processingState": "string", - } - Experiment.parse_obj(experiment) + Experiment.parse_obj(self.experiment) def test_valid_exclude_optional(self): - experiment = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - } - Experiment.parse_obj(experiment) - - def test_invalid_keywords(self): - experiment = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "keywords": ["null"], - } - with self.assertRaises(ValidationError): - Experiment.parse_obj(experiment) - - -class TestExperimentSet(TestCase): - def test_valid(self): - user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} - experiment = {"title": "title", "shortDescription": "short description", "abstractText": "abstract", "methodText": "methods"} - experimentset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "extraMetadata": {}, - "creationDate": "2022-02-02", - "publishedDate": "2022-02-02", - "modificationDate": "2022-02-02", - "createdBy": user, - "modifiedBy": user, - #"urn": "urn", - "id": 0, - "experiments": [experiment], - "numExperiments": 1, - } - ExperimentSet.parse_obj(experimentset) + self.experiment.pop("extraMetadata") + self.experiment.pop("keywords") + self.experiment.pop("doiIdentifiers") + self.experiment.pop("pubmedIdentifiers") + Experiment.parse_obj(self.experiment) class TestScoreSet(TestCase): @@ -190,49 +85,13 @@ def setUp(self): } def test_valid_all_fields(self): - user = {"orcid_id": "id", "firstName": "first", "lastName": "last", "email": "firstlast@email.edu"} - experiment = {"title": "title", "shortDescription": "short description", "abstractText": "abstract", "methodText": "methods"} - doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} - 
pubmed_identifier = {"identifier": "29785012"} - genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} - sequence = {"sequenceType": "DNA", "sequence": "ATCG"} - target = {"name": "name", "category": "Protein coding", "referenceMaps": [reference_map], "wtSequence": sequence,} - scoreset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - "extraMetadata": {}, - "creationDate": "2022-02-02", - "publishedDate": "2022-02-02", - "modificationDate": "2022-02-02", - "createdBy": user, - "modifiedBy": user, - #"urn": "urn", - "dataUsagePolicy": "policy", - "licenceId": 0, - "replacesId": 0, - "keywords": ["string"], - "numVariants": 0, - "experiment": experiment, - "doiIdentifiers": [doi_identifier], - "pubmedIdentifiers": [pubmed_identifier], - "targetGene": target, - "datasetColumns": {}, - "private": True, - } - ScoreSet.parse_obj(scoreset) + ScoreSet.parse_obj(self.scoreset) - def test_invalid_keywords(self): - #TODO make sure all required fields are present - as written, this should not pass - scoreset = { - "title": "title", - "shortDescription": "short description", - "abstractText": "abstract", - "methodText": "methods", - #"keywords": ["null"], - } - with self.assertRaises(ValidationError): - ScoreSet.parse_obj(scoreset) + def test_valid_exclude_optional(self): + self.scoreset.pop("extraMetadata") + self.scoreset.pop("keywords") + self.scoreset.pop("replacesId") + self.scoreset.pop("doiIdentifiers") + self.scoreset.pop("pubmedIdentifiers") + ScoreSet.parse_obj(self.scoreset) From 55787aec9928e862a3aeae7efb33b18e5eb1b59b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:19:52 -0700 Subject: [PATCH 601/877] refactor rename test directory --- tests/{test_models => models}/__init__.py | 0 tests/{test_models => 
models}/data.py | 0 tests/{test_models => models}/identifier.py | 0 tests/{test_models => models}/map.py | 0 tests/{test_models => models}/sequence.py | 0 tests/{test_models => models}/target.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_models => models}/__init__.py (100%) rename tests/{test_models => models}/data.py (100%) rename tests/{test_models => models}/identifier.py (100%) rename tests/{test_models => models}/map.py (100%) rename tests/{test_models => models}/sequence.py (100%) rename tests/{test_models => models}/target.py (100%) diff --git a/tests/test_models/__init__.py b/tests/models/__init__.py similarity index 100% rename from tests/test_models/__init__.py rename to tests/models/__init__.py diff --git a/tests/test_models/data.py b/tests/models/data.py similarity index 100% rename from tests/test_models/data.py rename to tests/models/data.py diff --git a/tests/test_models/identifier.py b/tests/models/identifier.py similarity index 100% rename from tests/test_models/identifier.py rename to tests/models/identifier.py diff --git a/tests/test_models/map.py b/tests/models/map.py similarity index 100% rename from tests/test_models/map.py rename to tests/models/map.py diff --git a/tests/test_models/sequence.py b/tests/models/sequence.py similarity index 100% rename from tests/test_models/sequence.py rename to tests/models/sequence.py diff --git a/tests/test_models/target.py b/tests/models/target.py similarity index 100% rename from tests/test_models/target.py rename to tests/models/target.py From 6fa283429e4ed537b32d8b173bfcc5046dc02b94 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:20:22 -0700 Subject: [PATCH 602/877] refactor, rename test directory --- tests/{test_validation => validation}/dataframe.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_validation => validation}/dataframe.py (100%) diff --git a/tests/test_validation/dataframe.py 
b/tests/validation/dataframe.py similarity index 100% rename from tests/test_validation/dataframe.py rename to tests/validation/dataframe.py From 9b00e493eb59cac16906a7d453fd821cb16e6311 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 16:21:06 -0700 Subject: [PATCH 603/877] comment out custom validation --- mavecore/models/map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index fae0c3f..c68e98a 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -7,9 +7,9 @@ class ReferenceMap(BaseModel): genomeId: int targetId: int - @validator('creationDate', 'modificationDate') + '''@validator('creationDate', 'modificationDate') def date_must_match_regex(cls, v): # regular expression for validating a date regex = '%Y-%m-%d' if not bool(datetime.strptime(v, regex)): - raise ValidationError("{}'s is not a valid date.".format(v)) + raise ValidationError("{}'s is not a valid date.".format(v))''' From dd58d53c893162b0efdfc7be00e742183c7c06d0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:00:19 -0700 Subject: [PATCH 604/877] add setup methods to pydantic model unittest classes --- tests/models/identifier.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index 93dc885..c036fa1 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -4,6 +4,13 @@ class TestIdentifier(TestCase): + def setUp(self): + self.identifier = { + "identifier": "10.1038/s41588-018-0122-z", + "id": 0, + "url": "https://www.uw.edu", + } + def test_valid_all_fields(self): identifier = { "identifier": "10.1038/s41588-018-0122-z", @@ -29,6 +36,13 @@ def test_invalid_url(self): class TestDoiIdentifier(TestCase): + def setUp(self): + self.doi_identifier = { + "identifier": "10.1038/s41588-018-0122-z", + "id": 0, 
+ "url": "https://www.uw.edu", + } + def test_valid_all_fields(self): doi_identifier = { "identifier": "10.1038/s41588-018-0122-z", @@ -46,6 +60,14 @@ def test_invalid_type_of_identifier(self): class TestPubmedIdentifier(TestCase): + def setUp(self): + self.pubmed_identifier = { + "identifier": "29785012", + "id": 0, + "url": "https://www.uw.edu", + "referenceHtml": "referencehtml", + } + def test_valid_all_fields(self): pubmed_identifier = { "identifier": "29785012", From 72dd3b61927cea4f56719a3a854dee829c28947f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:00:51 -0700 Subject: [PATCH 605/877] refactor pydantic unittests to work with setUp methods --- tests/models/identifier.py | 57 +++++++++++--------------------------- 1 file changed, 16 insertions(+), 41 deletions(-) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index c036fa1..894cf68 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -12,27 +12,17 @@ def setUp(self): } def test_valid_all_fields(self): - identifier = { - "identifier": "10.1038/s41588-018-0122-z", - "id": 0, - "url": "https://www.uw.edu", - } - Identifier.parse_obj(identifier) + Identifier.parse_obj(self.identifier) def test_valid_exclude_optional(self): - identifier = { - "identifier": "29785012", - } - Identifier.parse_obj(identifier) + self.identifier.pop("id") + self.identifier.pop("url") + Identifier.parse_obj(self.identifier) def test_invalid_url(self): - identifier = { - "identifier": "29785012", - "id": 0, - "url": "www.uw.edu", - } + self.identifier["url"] = "www.uw.edu" with self.assertRaises(ValidationError): - Identifier.parse_obj(identifier) + Identifier.parse_obj(self.identifier) class TestDoiIdentifier(TestCase): @@ -44,19 +34,12 @@ def setUp(self): } def test_valid_all_fields(self): - doi_identifier = { - "identifier": "10.1038/s41588-018-0122-z", - "id": 0, - "url": "https://www.uw.edu", - } - 
DoiIdentifier.parse_obj(doi_identifier) + DoiIdentifier.parse_obj(self.doi_identifier) def test_invalid_type_of_identifier(self): - identifier = { - "identifier": "29785012", - } + self.doi_identifier["identifier"] = "29785012" with self.assertRaises(ValidationError): - DoiIdentifier.parse_obj(identifier) + DoiIdentifier.parse_obj(self.doi_identifier) class TestPubmedIdentifier(TestCase): @@ -69,23 +52,15 @@ def setUp(self): } def test_valid_all_fields(self): - pubmed_identifier = { - "identifier": "29785012", - "id": 0, - "url": "https://www.uw.edu", - "referenceHtml": "referencehtml", - } - PubmedIdentifier.parse_obj(pubmed_identifier) + PubmedIdentifier.parse_obj(self.pubmed_identifier) def test_valid_exclude_optional(self): - pubmed_identifier = { - "identifier": "29785012", - } - PubmedIdentifier.parse_obj(pubmed_identifier) + self.pubmed_identifier.pop("id") + self.pubmed_identifier.pop("url") + self.pubmed_identifier.pop("referenceHtml") + PubmedIdentifier.parse_obj(self.pubmed_identifier) def test_invalid_type_of_identifier(self): - identifier = { - "identifier": "10.1038/s41588-018-0122-z", - } + self.pubmed_identifier["identifier"] = "10.1038/s41588-018-0122-z" with self.assertRaises(ValidationError): - PubmedIdentifier.parse_obj(identifier) + PubmedIdentifier.parse_obj(self.pubmed_identifier) From 7ba721d0795a75c33093addbe3fb3c1cbf5f5c75 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 18:04:57 -0700 Subject: [PATCH 606/877] keyword validators --- tests/validation/keyword.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/validation/keyword.py diff --git a/tests/validation/keyword.py b/tests/validation/keyword.py new file mode 100644 index 0000000..629d374 --- /dev/null +++ b/tests/validation/keyword.py @@ -0,0 +1,23 @@ +from unittest import TestCase + +from mavecore.validation.keyword import ( + validate_keywords, + validate_keyword, + validate_keyword_list, 
+) +from mavecore.validation.exceptions import ValidationError + + +class TestKeywordValidators(TestCase): + """ + Tests that each validator throws the appropriate :class:`ValidationError` + when passed invalid input. + """ + + def test_ve_invalid_keyword(self): + with self.assertRaises(ValidationError): + validate_keyword(555) + + def test_ve_invalid_keyword_in_list(self): + with self.assertRaises(ValidationError): + validate_keyword_list(["protein", 555]) From 714b77178cfc7548738c1903f68d0b9127c2216a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 18:12:14 -0700 Subject: [PATCH 607/877] keyword validators --- mavecore/validation/keyword.py | 49 ++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 mavecore/validation/keyword.py diff --git a/mavecore/validation/keyword.py b/mavecore/validation/keyword.py new file mode 100644 index 0000000..2820927 --- /dev/null +++ b/mavecore/validation/keyword.py @@ -0,0 +1,49 @@ +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.utilities import is_null + + +def validate_keywords(v): + if is_null(v): + raise ValidationError("{} are not valid keywords. Keywords must be a valid list of strings.".format(v)) + else: + for keyword in v: + if is_null(keyword) or not isinstance(keyword, str): + raise ValidationError("{} not a valid keyword. Keywords must be valid strings.".format(keyword)) + + +def validate_keyword(kw): + """ + This function validates whether or not the kw parameter is valid by + checking that it is a string that is not null. If kw is null + or is not a string, an error is raised. + + Parameters + __________ + kw : str + The keyword to be validated. + + Raises + ______ + ValidationError + If the kw argument is not a valid string. + """ + if is_null(kw) or not isinstance(kw, str): + raise ValidationError( + f"'{kw}' not a valid keyword. Keywords must be valid strings." 
+ ) + + +def validate_keyword_list(values): + """ + This function takes a list of keyword values and validates that each one is valid. + A valid keyword is a non-null string. The validate_keyword function will raise an + ValidationError if any of the keywords are invalid. + + Parameters + __________ + values : list[str] + The list of values to be validated. + """ + for value in values: + if not is_null(value): + validate_keyword(value) \ No newline at end of file From e5b30621771a7ed09e9c2186101d90f09549d64d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 18:28:03 -0700 Subject: [PATCH 608/877] reformat file --- mavecore/validation/{keyword.py => keywords.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename mavecore/validation/{keyword.py => keywords.py} (97%) diff --git a/mavecore/validation/keyword.py b/mavecore/validation/keywords.py similarity index 97% rename from mavecore/validation/keyword.py rename to mavecore/validation/keywords.py index 2820927..17aaa02 100644 --- a/mavecore/validation/keyword.py +++ b/mavecore/validation/keywords.py @@ -46,4 +46,4 @@ def validate_keyword_list(values): """ for value in values: if not is_null(value): - validate_keyword(value) \ No newline at end of file + validate_keyword(value) From 786c4f463fc09cf347040763a5b068166d4caeab Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 18:28:13 -0700 Subject: [PATCH 609/877] edit imports --- tests/validation/{keyword.py => keywords.py} | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) rename tests/validation/{keyword.py => keywords.py} (81%) diff --git a/tests/validation/keyword.py b/tests/validation/keywords.py similarity index 81% rename from tests/validation/keyword.py rename to tests/validation/keywords.py index 629d374..e43efc2 100644 --- a/tests/validation/keyword.py +++ b/tests/validation/keywords.py @@ -1,10 +1,6 @@ from unittest import TestCase 
-from mavecore.validation.keyword import ( - validate_keywords, - validate_keyword, - validate_keyword_list, -) +from mavecore.validation.keywords import * from mavecore.validation.exceptions import ValidationError From 04ba5a1da8ccb74cb025b55f10774c91c0e5efbc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:24:16 -0700 Subject: [PATCH 610/877] delete files --- mavecore/validation_old/constants.py | 49 --- mavecore/validation_old/dataset_validators.py | 381 ------------------ mavecore/validation_old/exceptions.py | 5 - .../variant_validators/__init__.py | 1 - .../variant_validators/variant.py | 85 ---- tests/test_validation_old/__init__.py | 0 .../test_variant_validators/__init__.py | 0 .../test_hgvs_validators.py | 49 --- 8 files changed, 570 deletions(-) delete mode 100644 mavecore/validation_old/constants.py delete mode 100644 mavecore/validation_old/dataset_validators.py delete mode 100644 mavecore/validation_old/exceptions.py delete mode 100644 mavecore/validation_old/variant_validators/__init__.py delete mode 100644 mavecore/validation_old/variant_validators/variant.py delete mode 100644 tests/test_validation_old/__init__.py delete mode 100644 tests/test_validation_old/test_variant_validators/__init__.py delete mode 100644 tests/test_validation_old/test_variant_validators/test_hgvs_validators.py diff --git a/mavecore/validation_old/constants.py b/mavecore/validation_old/constants.py deleted file mode 100644 index f7bfe47..0000000 --- a/mavecore/validation_old/constants.py +++ /dev/null @@ -1,49 +0,0 @@ -import re - -""" -Null Constant definitions -""" -NA_STRING = "NA" -null_values_list = ( - "nan", - "na", - "none", - "", - "undefined", - "n/a", - "null", - "nil", -) -# enforce the assumption that these are all lowercase values -null_values_list = [s.lower() for s in null_values_list] -# add the NA_STRING only if it's not already in the list -if NA_STRING.lower() not in null_values_list: - 
null_values_list.append(NA_STRING.lower()) -null_values_list.sort() - -null_values_re = re.compile( - r"^\s+$|" + "|".join(f"^{s}$" for s in null_values_list if len(s)), - flags=re.IGNORECASE, -) - -readable_null_values_list = [f"'{s}'" for s in null_values_list] + ["whitespace"] - -hgvs_nt_column = "hgvs_nt" -hgvs_splice_column = "hgvs_splice" -hgvs_pro_column = "hgvs_pro" -hgvs_columns = sorted([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]) -meta_data = "meta_data" -score_columns = "score_columns" -count_columns = "count_columns" -variant_score_data = "score_data" -variant_count_data = "count_data" -required_score_column = "score" - -valid_dataset_columns = [score_columns, count_columns] -valid_variant_columns = [variant_score_data, variant_count_data] - -variant_to_scoreset_column = { - variant_score_data: score_columns, - variant_count_data: count_columns, -} -scoreset_to_variant_column = {v: k for k, v in variant_to_scoreset_column.items()} diff --git a/mavecore/validation_old/dataset_validators.py b/mavecore/validation_old/dataset_validators.py deleted file mode 100644 index 3ac8d7b..0000000 --- a/mavecore/validation_old/dataset_validators.py +++ /dev/null @@ -1,381 +0,0 @@ -import io -import csv -import re - -from numpy.testing import assert_array_equal - -from mavecore.validation import constants -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation.utilities import is_null - - -class WordLimitValidator: - """ - This class validates the word limit set for a given object. - - Attributes - __________ - message : str - Message template to describe how many words a field is limited to. - code : str - code attribute is set to invalid - counter : `re.Pattern` - The regex pattern that will be used to identify the number of words. - """ - - message = "This field is limited to {} words." 
- code = "invalid" - counter = re.compile(r"\w+\b", flags=re.IGNORECASE) - - def __init__(self, word_limit, message=None, code=None): - """ - This constructor sets the values of the WordLimitValidator class attributes - message, code, and counter. - - Parameters - __________ - word_limit : int - The word limit assigned to the word limit attribute. - message : str - (default = None) The value assigned to the message attribute that is displayed when an error is raised. - code : str - (default = None) The code assigned to the code attribute. - """ - if message is not None: - self.message = message - if code is not None: - self.code = code - self.word_limit = int(word_limit) - - def __call__(self, value): - """ - This special method will raise a ValidationError if the number of times the regex pattern (defined by the - counter attribute) found in the value parameter exceeds the word_limit attribute value. In short, __call__ - checks if the number of words exceeds the word_limit. - - Parameters - __________ - value : str - The string in which the pattern defined in the counter attribute will be found. - - Returns - _______ - If value is not empty or false. - - Raises - ______ - ValidationError - If the number of times the regex pattern (defined by the counter attribute) found in the value parameter - exceeds the word_limit attribute value. - """ - if not value: - return - if len(self.counter.findall(value)) > self.word_limit: - raise ValidationError(self.message.format(self.word_limit)) - - -def read_header_from_io(file, label=None, msg=None): - # TODO - # confirm type for the file parameter - """ - This takes a file and reads the header from that file. - - Parameters - __________ - file : - label : str - (default = None) - msg : str - (default = None) The message that is printed in the event of an error is raised. The value is updated within - the function. - - Returns - _______ - str - The header that was read from io. 
- - Raises - ______ - ValidationError - If a header could not be parsed from file. Columns must be coma delimited. Column names - with commas must be escaped by enclosing them in double quotes. - """ - if label is None: - label = "uploaded" - - try: - header_line = file.readline() - if isinstance(header_line, bytes): - header_line = header_line.decode() - file.seek(0) - f = io.StringIO(header_line.strip()) - return [h.strip() for h in csv.DictReader(f, delimiter=",").fieldnames] - except Exception: - if not msg: - msg = ( - "A header could not be parsed from your {} file. Make sure" - "Columns are comma delimited. Column names with commas must be" - "escaped by enclosing them in double quotes.".format(label) - ) - raise ValidationError(msg) - - -def validate_has_hgvs_in_header(header, label=None, msg=None): - """ - Determines whether or not hgvs is in a header. - - Parameters - __________ - header : str - The first line of the file being validated. - label : - (default = None) - msg : - (default = None) The message that is printed in the event of an error is raised. The value is updated within - the function. - - Raises - ______ - ValidationError - If the header is empty and there exists a value for the constants.hgvs_columns parameter. - """ - if label is None: - label = "Uploaded" - params = {} - if msg is None: - msg = ( - "Your %(label)s file must define either a nucleotide hgvs column " - "'%(col_nt)s' or a protein hgvs column '%(col_p)s'. " - "Columns are case-sensitive and must be comma delimited." 
- ) - params = { - "label": label, - "col_nt": constants.hgvs_nt_column, - "col_p": constants.hgvs_pro_column, - } - if not set(header) & set(constants.hgvs_columns): - raise ValidationError(msg) - - -def validate_at_least_one_additional_column(header, label=None, msg=None): - """ - This function checks the passed header to see if there exists additional columns besides the three - specified by constants.hgvs_nt_column, constants.hgvs_splice_column, and constants.hgvs_pro_column. - - Parameters - __________ - header : str - The first line of the file being validated. - label : str - (default = None) - msg : str - (default = None) The message that is printed in the event of an error is raised. The value is updated within - the function. - - Raises - ______ - ValidationError - If there are not additional columns in the header argument. - """ - if label is None: - label = "Uploaded" - params = {} - if not any(v not in constants.hgvs_columns for v in header): - if msg is None: - msg = ( - "Your %(label)s file must define at " - "least one additional column different " - "from '{}', '{}' and '{}'.".format( - constants.hgvs_nt_column, - constants.hgvs_splice_column, - constants.hgvs_pro_column, - ) - ) - params = {"label": label} - raise ValidationError(msg) - - -def validate_header_contains_no_null_columns(header, label=None, msg=None): - """ - This function checks that the header parameter does not contain any null columns that - are not in the case-insensitive null values listed in constants.readable_null_values. - - Parameters - __________ - header : str - The first line of the file being validated. - label : str - (default = None) - msg : str - (default = None) The message that is printed in the event of an error is raised. The value is updated within - the function. - - Raises - ______ - ValidationError - If the file header contains blank/empty/whitespace. Only columns or the - case-insensitive null values listed in constants.readable_null_values - are permitted. 
- """ - if label is None: - label = "File" - any_null = any([is_null(v) for v in header]) - if any_null: - if msg is None: - msg = ( - "%(label)s file header cannot contain blank/empty/whitespace " - "only columns or the following case-insensitive null " - "values: {}.".format( - label, ", ".join(constants.readable_null_values_list) - ) - ) - raise ValidationError(msg) - - -def validate_datasets_define_same_variants(scores, counts): - """ - Checks if two `pd.DataFrame` objects parsed from uploaded files - define the same variants. - - Parameters - ---------- - scores : `pd.DataFrame` - Scores dataframe parsed from an uploaded scores file. - counts : `pd.DataFrame` - Scores dataframe parsed from an uploaded counts file. - - Raises - ______ - ValidationError - If score and counts files do not define the same variants. - """ - try: - assert_array_equal( - scores[constants.hgvs_nt_column].sort_values().values, - counts[constants.hgvs_nt_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_splice_column].sort_values().values, - counts[constants.hgvs_splice_column].sort_values().values, - ) - assert_array_equal( - scores[constants.hgvs_pro_column].sort_values().values, - counts[constants.hgvs_pro_column].sort_values().values, - ) - except AssertionError: - raise ValidationError( - "Your score and counts files do not define the same variants. " - "Check that the hgvs columns in both files match." - ) - - -def validate_scoreset_score_data_input(file): - """ - Validator function for checking that the scores file input contains - at least the column 'hgvs' and 'score'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - An open file handle in read mode. 
- - Raises - ______ - ValidationError - If score data file is missing the required column constants.required_score_column - """ - file.seek(0) - header = read_header_from_io(file, label="Score") - validate_header_contains_no_null_columns(header, label="Score") - validate_has_hgvs_in_header(header, label="Score") - validate_at_least_one_additional_column(header, label="Score") - - if constants.required_score_column not in header: - raise ValidationError( - "Score data file is missing the required column " - + constants.required_score_column - + "." - + "Columns are case-sensitive and must be comma delimited." - ) - - -def validate_scoreset_count_data_input(file): - """ - Validator function for checking that the counts file input contains - at least the column 'hgvs'. Returns the file to position 0 - after reading the header (first line). - - Parameters - ---------- - file : :class:`io.FileIO` - File parsed by a `django` form. - """ - file.seek(0) - header = read_header_from_io(file, label="Count") - validate_header_contains_no_null_columns(header, label="Count") - validate_has_hgvs_in_header(header, label="Count") - validate_at_least_one_additional_column(header, label="Count") - - -def validate_scoreset_json(dict_): - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `dataset_columns` attribute in a :class:`ScoreSet` instance. - - Parameters - ---------- - dict_ : dict - Dictionary of keys mapping to a list. - - Raises - ______ - ValidationError - If scoreset data is missing the required key. - ValidationError - If header values are not strings. - ValidationError - If - ValidationError - If missing required column constants.required_score_column for score dataset. - ValidationError - If encountered unexpected keys extras. 
- """ - required_columns = [constants.score_columns, constants.count_columns] - - for key in required_columns: - if key not in dict_.keys(): - raise ValidationError( - "Scoreset data is missing the required key " + key - ) - - columns = dict_[key] - if not all([isinstance(c, str) for c in columns]): - raise ValidationError("Header values must be strings.") - - if not isinstance(columns, list): - type_ = type(columns).__name__ - raise ValidationError( - "Value for " - + key.replace("_", " ") - + " must be a list not " - + type_ - ) - - # Check score columns is not-empty and at least contains hgvs and score - if key == constants.score_columns: - if constants.required_score_column not in columns: - raise ValidationError( - "Missing required column constants.required_score_column " - "for score dataset." - ) - - # Check there are not unexpected columns supplied to the scoreset json - # field. - extras = [k for k in dict_.keys() if k not in set(required_columns)] - if len(extras) > 0: - extras = [k for k in dict_.keys() if k not in required_columns] - raise ValidationError("Encountered unexpected keys extras") - - diff --git a/mavecore/validation_old/exceptions.py b/mavecore/validation_old/exceptions.py deleted file mode 100644 index b3e419b..0000000 --- a/mavecore/validation_old/exceptions.py +++ /dev/null @@ -1,5 +0,0 @@ -NON_FIELD_ERRORS = "__all__" - - -class ValidationError(ValueError): - None \ No newline at end of file diff --git a/mavecore/validation_old/variant_validators/__init__.py b/mavecore/validation_old/variant_validators/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/mavecore/validation_old/variant_validators/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/mavecore/validation_old/variant_validators/variant.py b/mavecore/validation_old/variant_validators/variant.py deleted file mode 100644 index 140fed4..0000000 --- a/mavecore/validation_old/variant_validators/variant.py +++ /dev/null @@ -1,85 +0,0 @@ -from typing import Dict - 
-from mavecore.validation.constants import ( - variant_score_data, - variant_count_data, - required_score_column, -) -from mavecore.validation.exceptions import ValidationError - - -def validate_columns_match(variant, scoreset) -> None: - """ - Validate that a child matches parents defined columns to keep - data in sync. - - Parameters - __________ - variant : - scoreset : - - Raises - ______ - ValidationError - If variant score columns do not match scoreset score columns. - ValidationError - If variant count columns do not match scoreset count columns. - ValidationError - If try fails within try except block. - """ - try: - if variant.score_columns != scoreset.score_columns: - raise ValidationError( - f"Variant defines score columns '{variant.score_columns}' " - f"but parent defines columns '{scoreset.score_columns}. " - ) - if variant.count_columns != scoreset.count_columns: - raise ValidationError( - f"Variant defines count columns '{variant.count_columns}' " - f"but parent defines columns '{scoreset.count_columns}. " - ) - except KeyError as error: - raise ValidationError(f"Missing key {str(error)}") - - -def validate_variant_json(data: Dict[str, Dict]) -> None: - """ - Checks a given dictionary to ensure that it is suitable to be used - as the `data` attribute in a :class:`Variant` instance. - - Parameters - ---------- - data : dict[str, dict] - Dictionary of keys mapping to a list. - - Raises - ______ - ValidationError - If missing the required key. - ValidationError - If missing the required column in variant's score data. - ValidationError - If encountered unexpected keys. - ValidationError - If value for key is not of type dict. 
- """ - expected_keys = [variant_score_data, variant_count_data] - for key in expected_keys: - if key not in data.keys(): - raise ValidationError(f"Missing the required key {key}") - - if required_score_column not in data[variant_score_data]: - raise ValidationError( - f"Missing required column '{required_score_column}' in variant's score data." - ) - - extras = [k for k in data.keys() if k not in set(expected_keys)] - if len(extras) > 0: - extras = [k for k in data.keys() if k not in expected_keys] - raise ValidationError("Encountered unexpected keys {extras}") - - # Check the correct data types are given. - for key in expected_keys: - if not isinstance(data[key], dict): - type_ = type(data[key]).__name__ - raise ValidationError(f"Value for '{key}' must be a dict not {type_}.") diff --git a/tests/test_validation_old/__init__.py b/tests/test_validation_old/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_validation_old/test_variant_validators/__init__.py b/tests/test_validation_old/test_variant_validators/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_validation_old/test_variant_validators/test_hgvs_validators.py b/tests/test_validation_old/test_variant_validators/test_hgvs_validators.py deleted file mode 100644 index 944f435..0000000 --- a/tests/test_validation_old/test_variant_validators/test_hgvs_validators.py +++ /dev/null @@ -1,49 +0,0 @@ -# from core.utilities import null_values_list -from unittest import TestCase - -from mavecore.validation.variant_validators import hgvs -from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import null_values_list - - -class TestValidateHgvsString(TestCase): - def test_passes_on_null(self): - for v in null_values_list: - hgvs.validate_hgvs_string(v) - - def test_error_not_str(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string(1.0) - - def test_error_unknown_column(self): - with 
self.assertRaises(ValueError): - hgvs.validate_hgvs_string("c.1A>G", column="random") - - def test_error_does_not_match_splice(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("g.G4L", column="splice") - - def test_error_nt_is_not_g_when_splice_present(self): - hgvs.validate_hgvs_string("c.1A>G", column="nt", splice_present=False) - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("c.1A>G", column="nt", splice_present=True) - - def test_error_does_not_match_nt(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("p.G4L", column="nt") - - def test_error_does_not_match_pro(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("c.1A>G", column="p") - - def test_raises_on_enrich_special_types(self): - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("_wt") - with self.assertRaises(ValidationError): - hgvs.validate_hgvs_string("_sy") - - def test_validates_valid_hgvs(self): - hgvs.validate_hgvs_string("c.1A>G", column="nt", splice_present=False) - hgvs.validate_hgvs_string("g.1A>G", column="nt", splice_present=True) - hgvs.validate_hgvs_string("c.1A>G", column="splice") - hgvs.validate_hgvs_string("p.(=)", column="p") From 191761c47f62fe4e269177a2db8abb83c6458faa Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:24:45 -0700 Subject: [PATCH 611/877] test variant validation --- tests/validation/variant.py | 101 ++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 tests/validation/variant.py diff --git a/tests/validation/variant.py b/tests/validation/variant.py new file mode 100644 index 0000000..ea35e6d --- /dev/null +++ b/tests/validation/variant.py @@ -0,0 +1,101 @@ +# from core.utilities import null_values_list +from unittest import TestCase + +from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.exceptions import 
ValidationError +from mavecore.validation.constants.general import null_values_list + + +class TestValidateHgvsString(TestCase): + def test_passes_on_null(self): + for v in null_values_list: + validate_hgvs_string(v) + + def test_error_not_str(self): + with self.assertRaises(ValidationError): + validate_hgvs_string(1.0) + + def test_error_unknown_column(self): + with self.assertRaises(ValueError): + validate_hgvs_string("c.1A>G", column="random") + + def test_error_does_not_match_splice(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("g.G4L", column="splice") + + def test_error_nt_is_not_g_when_splice_present(self): + validate_hgvs_string("c.1A>G", column="nt", splice_present=False) + with self.assertRaises(ValidationError): + validate_hgvs_string("c.1A>G", column="nt", splice_present=True) + + def test_error_does_not_match_nt(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("p.G4L", column="nt") + + def test_error_does_not_match_pro(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("c.1A>G", column="p") + + def test_raises_on_enrich_special_types(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("_wt") + with self.assertRaises(ValidationError): + validate_hgvs_string("_sy") + + def test_validates_valid_hgvs(self): + validate_hgvs_string("c.1A>G", column="nt", splice_present=False) + validate_hgvs_string("g.1A>G", column="nt", splice_present=True) + validate_hgvs_string("c.1A>G", column="splice") + validate_hgvs_string("p.(=)", column="p") + + +class TestHGVSValidator(TestCase): + """ + Tests the function :func:`validate_hgvs_string` to see if it is able + to validate strings which do not comply with the HGVS standard for + coding, non-coding and nucleotide variants and multi-variants. 
+ """ + + def test_validation_error_not_str_or_bytes(self): + with self.assertRaises(ValidationError): + validate_hgvs_string([]) + + def test_does_not_pass_enrich_wt_hgvs(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("_wt") + + def test_does_not_pass_enrich_sy_hgvs(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("_sy") + + def test_passes_multi(self): + validate_hgvs_string("p.[Lys4Gly;Lys5Phe]", column="p") + validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="nt") + validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="splice") + + def test_error_invalid_hgvs(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("c.ad", column="nt") + + def test_error_invalid_nt_prefix(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("r.1a>g", column="nt") + + with self.assertRaises(ValidationError): + validate_hgvs_string("c.1A>G", column="nt", splice_present=True) + + def test_error_invalid_splice_prefix(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("r.1a>g", column="splice") + + def test_error_invalid_pro_prefix(self): + with self.assertRaises(ValidationError): + validate_hgvs_string("r.1a>g", column="p") + + def test_converts_bytes_to_string_before_validation(self): + validate_hgvs_string(b"c.427A>G", column="splice") + + def test_return_none_for_null(self): + for c in null_values_list: + self.assertIsNone(validate_hgvs_string(c, column="nt")) + From 0e88db42cf303f06d1db8a441ecd766424e659eb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:24:58 -0700 Subject: [PATCH 612/877] move file --- {mavecore/validation_old => tests/validation}/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {mavecore/validation_old => tests/validation}/__init__.py (100%) diff --git a/mavecore/validation_old/__init__.py b/tests/validation/__init__.py similarity index 100% rename from 
mavecore/validation_old/__init__.py rename to tests/validation/__init__.py From d02203d797fe4d252ed1bab8a505f8328680653e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:25:15 -0700 Subject: [PATCH 613/877] edit imports --- mavecore/models/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 9c7d01a..7625893 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -6,6 +6,7 @@ from mavecore.validation.constants.urn import * from mavecore.validation.utilities import is_null +from mavecore.validation import keywords, urn class DataSet(BaseModel): From 5874e713117fbb48ee509ed64900dc532b918e03 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:25:43 -0700 Subject: [PATCH 614/877] import keyword pydantic attribute validation --- mavecore/models/data.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 7625893..be6790a 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -19,12 +19,7 @@ class DataSet(BaseModel): @validator('keywords') def validate_keywords(cls, v): - if is_null(v): - raise ValidationError("{} are not valid keywords. Keywords must be a valid list of strings.".format(v)) - else: - for keyword in v: - if is_null(keyword) or not isinstance(keyword, str): - raise ValidationError("{} not a valid keyword. 
Keywords must be valid strings.".format(keyword)) + keywords.validate_keywords(v) class Experiment(DataSet): From 35ab7fb90c041b462ed8f995907bff792e24069f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:25:50 -0700 Subject: [PATCH 615/877] import urn pydantic attribute validation --- mavecore/models/data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index be6790a..507f0f8 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -39,5 +39,4 @@ class ScoreSet(DataSet): @validator('urn') def validate_matches_regular_expression(cls, v): - if not (MAVEDB_SCORESET_URN_RE.match(v) or MAVEDB_TMP_URN_RE.match(v)): - raise ValidationError("{}'s is not a valid score set urn.".format(v)) + urn.validate_mavedb_urn_scoreset(v) From 46217ad15ce2a0c321f61461a1803991718b2e25 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:26:01 -0700 Subject: [PATCH 616/877] edit imports --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 083a674..fbcd287 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -3,6 +3,7 @@ from mavehgvs import Variant from mavecore.validation.constants.general import * from mavecore.validation.exceptions import ValidationError +from mavecore.validation.variant import validate_hgvs_string def validate_dataframes(scores=None, counts=None): From f331d2f9b7e34668d9076e240f21790f47511aff Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:26:29 -0700 Subject: [PATCH 617/877] edit dataframe validation --- mavecore/validation/dataframe.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/dataframe.py 
b/mavecore/validation/dataframe.py index fbcd287..e9d5d87 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -24,12 +24,14 @@ def validate_dataframes(scores=None, counts=None): If any of the validation fails. """ validate_no_null_columns_or_rows(scores) - validate_column_names(scores.columns) - validate_variants(scores) + hgvs_columns = validate_column_names(scores.columns) + for column in hgvs_columns: + validate_variants(scores[column]) if counts is not None: validate_no_null_columns_or_rows(counts) - validate_column_names(counts.columns) - validate_variants(counts) + hgvs_columns = validate_column_names(counts.columns) + for column in hgvs_columns: + validate_variants(counts[column]) validate_dataframes_define_same_variants(scores, counts) From 1908a21ec5ce6e98c3612ee10c4fbd156a667834 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:26:46 -0700 Subject: [PATCH 618/877] add TODO --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index e9d5d87..c8a53ea 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -93,6 +93,7 @@ def validate_column_names(columns): if len(columns) == count: raise ValidationError("There must be at least one additional column beyond the hgvs columns.") # validate against UTF-8byte ordering marks + # TODO if dataframe is a scores df make sure it has a score column def validate_variants(variants, column_name=None): From e2c7f9f1e20e98c826c802f4794dbb28a2b01976 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:27:09 -0700 Subject: [PATCH 619/877] change hgvs string validation code --- mavecore/validation/dataframe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py 
index c8a53ea..fc490a3 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -115,7 +115,14 @@ def validate_variants(variants, column_name=None): """ # variant strings will be cast into hgvs variant objects to validate for variant in variants: - try: + if column_name == "hgvs_nt": + column = "nt" + elif column_name == "hgvs_pro": + column = "p" + elif column_name == "hgvs_splice": + column = "splice" + validate_hgvs_string(variant, column=column) + '''try: v = Variant(variant) # variants should align with the hgvs column names # check this by seeing if the prefix makes sense with regards to the hgvs column name From ac1b82aca6efaaafc2d8f8a3b3f30e8c2f1f6595 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:27:22 -0700 Subject: [PATCH 620/877] comment --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index fc490a3..536c91a 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -128,7 +128,7 @@ def validate_variants(variants, column_name=None): # check this by seeing if the prefix makes sense with regards to the hgvs column name validate_variant_matches_hgvs_column_name(column_name, v.prefix) except ValidationError: - raise ValidationError(variant + " does not adhere to mavehgvs variant guidelines.") + raise ValidationError(variant + " does not adhere to mavehgvs variant guidelines.")''' def validate_variant_matches_hgvs_column_name(variant, column_name): From e211595810ffd668017a2340f3d3da24fac709a8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:27:43 -0700 Subject: [PATCH 621/877] remove keyword validation --- .../identifier.py} | 38 ------------------- 1 file changed, 38 deletions(-) rename mavecore/{validation_old/metadata_validators.py => 
validation/identifier.py} (86%) diff --git a/mavecore/validation_old/metadata_validators.py b/mavecore/validation/identifier.py similarity index 86% rename from mavecore/validation_old/metadata_validators.py rename to mavecore/validation/identifier.py index 70b731b..9c44e47 100644 --- a/mavecore/validation_old/metadata_validators.py +++ b/mavecore/validation/identifier.py @@ -31,28 +31,6 @@ def validate_sra_identifier(identifier): ) -def validate_keyword(kw): - """ - This function validates whether or not the kw parameter is valid by - checking that it is a string that is not null. If kw is null - or is not a string, an error is raised. - - Parameters - __________ - kw : str - The keyword to be validated. - - Raises - ______ - ValidationError - If the kw argument is not a valid string. - """ - if is_null(kw) or not isinstance(kw, str): - raise ValidationError( - f"'{kw}' not a valid keyword. Keywords must be valid strings." - ) - - def validate_pubmed_identifier(identifier): """ Validates whether the identifier is a valid PubMed identifier. @@ -163,22 +141,6 @@ def validate_genome_identifier(identifier): ) -def validate_keyword_list(values): - """ - This function takes a list of keyword values and validates that each one is valid. - A valid keyword is a non-null string. The validate_keyword function will raise an - ValidationError if any of the keywords are invalid. - - Parameters - __________ - values : list[str] - The list of values to be validated. - """ - for value in values: - if not is_null(value): - validate_keyword(value) - - def validate_pubmed_list(values): """ Validates whether each identifier in a list of identifiers (values) is a valid PubMed identifier. 
From 284cda250821c5a4862f00612b8af1c176fb8a1c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:28:00 -0700 Subject: [PATCH 622/877] edit imports --- .../test_metadata_validators.py => validation/identifier.py} | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) rename tests/{test_validation_old/test_metadata_validators.py => validation/identifier.py} (97%) diff --git a/tests/test_validation_old/test_metadata_validators.py b/tests/validation/identifier.py similarity index 97% rename from tests/test_validation_old/test_metadata_validators.py rename to tests/validation/identifier.py index 88c1f5c..b56086a 100644 --- a/tests/test_validation_old/test_metadata_validators.py +++ b/tests/validation/identifier.py @@ -1,15 +1,14 @@ from unittest import TestCase -from mavecore.validation.metadata_validators import ( +from mavecore.validation.identifier import ( validate_doi_identifier, validate_doi_list, - validate_keyword, - validate_keyword_list, validate_pubmed_identifier, validate_pubmed_list, validate_sra_identifier, validate_sra_list, validate_uniprot_identifier, + validate_uniprot_list, validate_refseq_identifier, validate_refseq_list, validate_genome_identifier, From 1a0740035702c795757e5c0332fbf6653c6be0ca Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:28:19 -0700 Subject: [PATCH 623/877] remove keyword test cases --- tests/validation/identifier.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/validation/identifier.py b/tests/validation/identifier.py index b56086a..ff00510 100644 --- a/tests/validation/identifier.py +++ b/tests/validation/identifier.py @@ -130,24 +130,6 @@ def test_ve_invalid_uniprot_id(self): with self.assertRaises(ValidationError): validate_uniprot_identifier("P123") - def test_ve_invalid_uniprot_list(self): - with self.assertRaises(ValidationError): - validate_keyword_list(["protein", 555]) - 
def test_passes_valid_uniprot_id(self): validate_uniprot_identifier("P01133") - -class TestKeywordValidators(TestCase): - """ - Tests that each validator throws the appropriate :class:`ValidationError` - when passed invalid input. - """ - - def test_ve_invalid_keyword(self): - with self.assertRaises(ValidationError): - validate_keyword(555) - - def test_ve_invalid_keyword_in_list(self): - with self.assertRaises(ValidationError): - validate_keyword_list(["protein", 555]) From a0357a0cdee11fca813f39638cc8a26bb80494f2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:28:42 -0700 Subject: [PATCH 624/877] update keyword validation functions --- mavecore/validation/keywords.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mavecore/validation/keywords.py b/mavecore/validation/keywords.py index 17aaa02..8e56254 100644 --- a/mavecore/validation/keywords.py +++ b/mavecore/validation/keywords.py @@ -4,14 +4,15 @@ def validate_keywords(v): if is_null(v): - raise ValidationError("{} are not valid keywords. Keywords must be a valid list of strings.".format(v)) + raise ValidationError("{} are not valid keywords. Keywords must be a non null list of strings.".format(v)) else: for keyword in v: - if is_null(keyword) or not isinstance(keyword, str): - raise ValidationError("{} not a valid keyword. Keywords must be valid strings.".format(keyword)) + validate_keyword(keyword) + '''if is_null(keyword) or not isinstance(keyword, str): + raise ValidationError("{} not a valid keyword. Keywords must be non null strings.".format(keyword)) +''' - -def validate_keyword(kw): +def validate_keyword(keyword): """ This function validates whether or not the kw parameter is valid by checking that it is a string that is not null. If kw is null @@ -27,13 +28,15 @@ def validate_keyword(kw): ValidationError If the kw argument is not a valid string. 
""" - if is_null(kw) or not isinstance(kw, str): + '''if is_null(kw) or not isinstance(kw, str): raise ValidationError( f"'{kw}' not a valid keyword. Keywords must be valid strings." - ) + )''' + if is_null(keyword) or not isinstance(keyword, str): + raise ValidationError("{} not a valid keyword. Keywords must be non null strings.".format(keyword)) -def validate_keyword_list(values): +'''def validate_keyword_list(values): """ This function takes a list of keyword values and validates that each one is valid. A valid keyword is a non-null string. The validate_keyword function will raise an @@ -46,4 +49,4 @@ def validate_keyword_list(values): """ for value in values: if not is_null(value): - validate_keyword(value) + validate_keyword(value)''' From 77661f7b2638f331852fcc6b22de41917e7c2505 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:29:10 -0700 Subject: [PATCH 625/877] delete word limit validator --- .../test_dataset_validators.py | 57 ------------------- 1 file changed, 57 deletions(-) rename tests/{test_validation_old => validation}/test_dataset_validators.py (82%) diff --git a/tests/test_validation_old/test_dataset_validators.py b/tests/validation/test_dataset_validators.py similarity index 82% rename from tests/test_validation_old/test_dataset_validators.py rename to tests/validation/test_dataset_validators.py index 6b4895a..66d3691 100644 --- a/tests/test_validation_old/test_dataset_validators.py +++ b/tests/validation/test_dataset_validators.py @@ -15,66 +15,9 @@ read_header_from_io, validate_scoreset_json, validate_datasets_define_same_variants, - WordLimitValidator, ) -class TestWordLimitValidator(TestCase): - def test_validation_error_more_than_word_limit(self): - with self.assertRaises(ValueError): - n = 5 - WordLimitValidator(n)("Word " * (n + 1)) - - def test_passes_equal_to_word_limit(self): - n = 5 - WordLimitValidator(n)("Word " * n) - - def test_passes_less_than_word_limit(self): - n = 5 - 
WordLimitValidator(n)("Word " * (n - 1)) - - -class TestHeaderFromIO(TestCase): - """ - Tests to ensure that a file in bytes or string mode can be read and then - returned to the start so there are no side effects for later reading the - files. - """ - - def test_can_read_header_from_bytes(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count"] - self.assertEqual(expected, header) - - def test_removes_quotes_from_header(self): - file = BytesIO( - '"{}","score","count,nt"\n'.format(constants.hgvs_nt_column).encode() - ) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count,nt"] - self.assertEqual(expected, header) - - def test_can_read_header_from_string(self): - file = StringIO("{},score,count\n".format(constants.hgvs_nt_column)) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count"] - self.assertEqual(expected, header) - - def test_strips_whitespace(self): - file = StringIO(" {} , score , count\n".format(constants.hgvs_nt_column)) - header = read_header_from_io(file) - expected = [constants.hgvs_nt_column, "score", "count"] - self.assertEqual(expected, header) - - def test_returns_file_position_to_begining(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - read_header_from_io(file) - self.assertEqual( - file.read(), "{},score,count\n".format(constants.hgvs_nt_column).encode() - ) - - class TestNoNullInColumnsValidator(TestCase): """ Tests to ensure that an input file contains no null values in the header From 5f70925333cfffc18859d6f8271a9b2f462b2ec2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:29:19 -0700 Subject: [PATCH 626/877] delete comment --- .../test_variant_validators => validation}/test_validators.py | 1 - 1 file changed, 1 deletion(-) rename 
tests/{test_validation_old/test_variant_validators => validation}/test_validators.py (99%) diff --git a/tests/test_validation_old/test_variant_validators/test_validators.py b/tests/validation/test_validators.py similarity index 99% rename from tests/test_validation_old/test_variant_validators/test_validators.py rename to tests/validation/test_validators.py index 495ca3e..3aac50c 100644 --- a/tests/test_validation_old/test_variant_validators/test_validators.py +++ b/tests/validation/test_validators.py @@ -10,7 +10,6 @@ from mavecore.validation import constants from mavecore.validation.exceptions import ValidationError -# from ..factories import generate_hgvs, VariantFactory from mavecore.validation.variant_validators import ( MaveDataset, validate_variant_json, From 4623653fafdce36e71f80a4a85c974ed7d5a14fc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:30:20 -0700 Subject: [PATCH 627/877] move variant unittests --- tests/validation/test_validators.py | 86 ----------------------------- 1 file changed, 86 deletions(-) diff --git a/tests/validation/test_validators.py b/tests/validation/test_validators.py index 3aac50c..389d0ff 100644 --- a/tests/validation/test_validators.py +++ b/tests/validation/test_validators.py @@ -42,92 +42,6 @@ def generate_hgvs(prefix: str = "c") -> str: return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}" -class TestHGVSValidator(TestCase): - """ - Tests the function :func:`validate_hgvs_string` to see if it is able - to validate strings which do not comply with the HGVS standard for - coding, non-coding and nucleotide variants and multi-variants. 
- """ - - def test_validation_error_not_str_or_bytes(self): - with self.assertRaises(ValidationError): - validate_hgvs_string([]) - - def test_does_not_pass_enrich_wt_hgvs(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("_wt") - - def test_does_not_pass_enrich_sy_hgvs(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("_sy") - - def test_passes_multi(self): - validate_hgvs_string("p.[Lys4Gly;Lys5Phe]", column="p") - validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="nt") - validate_hgvs_string("c.[1A>G;127_128delinsAGC]", column="splice") - - def test_error_invalid_hgvs(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("c.ad", column="nt") - - def test_error_invalid_nt_prefix(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("r.1a>g", column="nt") - - with self.assertRaises(ValidationError): - validate_hgvs_string("c.1A>G", column="nt", splice_present=True) - - def test_error_invalid_splice_prefix(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("r.1a>g", column="splice") - - def test_error_invalid_pro_prefix(self): - with self.assertRaises(ValidationError): - validate_hgvs_string("r.1a>g", column="p") - - def test_converts_bytes_to_string_before_validation(self): - validate_hgvs_string(b"c.427A>G", column="splice") - - def test_return_none_for_null(self): - for c in constants.null_values_list: - self.assertIsNone(validate_hgvs_string(c, column="nt")) - - -class TestVariantJsonValidator(TestCase): - """ - Tests the validator :func:`validate_variant_json` to check if the correct - errors are thrown if an incorrectly formatted `dictionary` is set - as a the `data` `JSONField` attribute of a :class:`..models.Variant` - instance. 
- """ - - def test_validation_error_missing_score_data_key(self): - data = {constants.variant_count_data: {}} - with self.assertRaises(ValidationError): - validate_variant_json(data) - - def test_validation_error_missing_count_data_key(self): - data = {constants.variant_score_data: {}} - with self.assertRaises(ValidationError): - validate_variant_json(data) - - def test_validation_error_contains_unexpected_keys(self): - data = { - "extra": {}, - constants.variant_score_data: {}, - constants.variant_count_data: {}, - } - with self.assertRaises(ValidationError): - validate_variant_json(data) - - def test_validation_error_values_not_dict(self): - data = {constants.variant_score_data: {}, constants.variant_count_data: {}} - for key in data.keys(): - data[key] = [] - with self.assertRaises(ValidationError): - validate_variant_json(data) - data[key] = {} - class TestMaveDataset(TestCase): """ From 56c6c038d66df29227643f26f729b1f99c651515 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:30:30 -0700 Subject: [PATCH 628/877] reformat file --- mavecore/validation/constants/urn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/constants/urn.py b/mavecore/validation/constants/urn.py index 54d9b22..74d6b16 100644 --- a/mavecore/validation/constants/urn.py +++ b/mavecore/validation/constants/urn.py @@ -51,4 +51,4 @@ ) ] ) -MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) \ No newline at end of file +MAVEDB_ANY_URN_RE = re.compile(MAVEDB_ANY_URN_PATTERN) From ee6540c0ba78c4cf3eea6f92e861273dc93b8ece Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:30:54 -0700 Subject: [PATCH 629/877] move constants to constants directory --- .../urn_validators.py => validation/urn.py} | 54 +------------------ 1 file changed, 1 insertion(+), 53 deletions(-) rename mavecore/{validation_old/urn_validators.py => validation/urn.py} (60%) 
diff --git a/mavecore/validation_old/urn_validators.py b/mavecore/validation/urn.py similarity index 60% rename from mavecore/validation_old/urn_validators.py rename to mavecore/validation/urn.py index 341dba6..bbda1b6 100644 --- a/mavecore/validation_old/urn_validators.py +++ b/mavecore/validation/urn.py @@ -1,58 +1,6 @@ import re from mavecore.validation.exceptions import ValidationError - -MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 -MAVEDB_TMP_URN_DIGITS = 16 -MAVEDB_URN_MAX_LENGTH = 64 -MAVEDB_URN_NAMESPACE = "mavedb" - - -# Temp URN patterns -# --------------------------------------------------------------------------- # -MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( - width=MAVEDB_TMP_URN_DIGITS -) -MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) - - -# Experimentset Pattern/Compiled RE -MAVEDB_EXPERIMENTSET_URN_PATTERN = r"^urn:{namespace}:\d{{{width}}}$".format( - namespace=MAVEDB_URN_NAMESPACE, width=MAVEDB_EXPERIMENTSET_URN_DIGITS -) -MAVEDB_EXPERIMENTSET_URN_RE = re.compile(MAVEDB_EXPERIMENTSET_URN_PATTERN) - -# Experiment Pattern/Compiled RE -MAVEDB_EXPERIMENT_URN_PATTERN = r"{pattern}-([a-z]+|0)$".format( - pattern=MAVEDB_EXPERIMENTSET_URN_PATTERN[:-1] -) -MAVEDB_EXPERIMENT_URN_RE = re.compile(MAVEDB_EXPERIMENT_URN_PATTERN) - -# Scoreset Pattern/Compiled RE -MAVEDB_SCORESET_URN_PATTERN = r"{pattern}-\d+$".format( - pattern=MAVEDB_EXPERIMENT_URN_PATTERN[:-1] -) -MAVEDB_SCORESET_URN_RE = re.compile(MAVEDB_SCORESET_URN_PATTERN) - -# Variant Pattern/Compiled RE -MAVEDB_VARIANT_URN_PATTERN = r"{pattern}#\d+$".format( - pattern=MAVEDB_SCORESET_URN_PATTERN[:-1] -) -MAVEDB_VARIANT_URN_RE = re.compile(MAVEDB_VARIANT_URN_PATTERN) - -# Any Pattern/Compiled RE -MAVEDB_ANY_URN_PATTERN = "|".join( - [ - r"({pattern})".format(pattern=p) - for p in ( - MAVEDB_EXPERIMENTSET_URN_PATTERN, - MAVEDB_EXPERIMENT_URN_PATTERN, - MAVEDB_SCORESET_URN_PATTERN, - MAVEDB_VARIANT_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, - ) - ] -) -MAVEDB_ANY_URN_RE = 
re.compile(MAVEDB_ANY_URN_PATTERN) +from mavecore.validation.constants.urn import * def validate_mavedb_urn(urn): From 311c61515e124fc1b75deaa6c3814375d5aa7c07 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 19:31:05 -0700 Subject: [PATCH 630/877] edit imports --- .../variant_validators/hgvs.py => validation/variant.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename mavecore/{validation_old/variant_validators/hgvs.py => validation/variant.py} (98%) diff --git a/mavecore/validation_old/variant_validators/hgvs.py b/mavecore/validation/variant.py similarity index 98% rename from mavecore/validation_old/variant_validators/hgvs.py rename to mavecore/validation/variant.py index b423ecf..c01c3c0 100644 --- a/mavecore/validation_old/variant_validators/hgvs.py +++ b/mavecore/validation/variant.py @@ -4,7 +4,7 @@ from mavehgvs import Variant, MaveHgvsParseError from mavecore.validation.exceptions import ValidationError -from mavecore.validation.constants import ( +from mavecore.validation.constants.general import ( hgvs_nt_column, hgvs_splice_column, hgvs_pro_column, From 82da85fa21d51dbd3331de9da514ff59127d9680 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 15 Sep 2022 21:07:09 -0700 Subject: [PATCH 631/877] change method name --- tests/validation/keywords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/validation/keywords.py b/tests/validation/keywords.py index e43efc2..bdd7cad 100644 --- a/tests/validation/keywords.py +++ b/tests/validation/keywords.py @@ -16,4 +16,4 @@ def test_ve_invalid_keyword(self): def test_ve_invalid_keyword_in_list(self): with self.assertRaises(ValidationError): - validate_keyword_list(["protein", 555]) + validate_keywords(["protein", 555]) From dbe9b0a868ca5348dcb56a7d5792cdec74711974 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:15:30 -0700 
Subject: [PATCH 632/877] edit imports --- mavecore/models/data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 507f0f8..1c31c05 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -4,8 +4,6 @@ from .identifier import DoiIdentifier, PubmedIdentifier from .target import TargetGene -from mavecore.validation.constants.urn import * -from mavecore.validation.utilities import is_null from mavecore.validation import keywords, urn From 765eecda34495ea90042c5883c613eb70f765140 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:15:57 -0700 Subject: [PATCH 633/877] edit imports --- mavecore/models/data.py | 2 +- mavecore/models/identifier.py | 2 ++ mavecore/models/sequence.py | 2 ++ mavecore/models/target.py | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 1c31c05..a4259a4 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, ValidationError, validator +from pydantic import BaseModel, validator from typing import List, Dict, Optional from .identifier import DoiIdentifier, PubmedIdentifier diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 3eae8f0..73a11a8 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -2,6 +2,8 @@ from typing import Optional import idutils +from mavecore.validation import identifier + class Identifier(BaseModel): identifier: str diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index cf43f9f..eb2db35 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, ValidationError, validator +from mavecore.validation import target + class WildType(BaseModel): sequenceType: str diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 
9c2bc44..f0fb058 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -4,6 +4,8 @@ from .map import ReferenceMap from .sequence import WildType +from mavecore.validation import target + class TargetGene(BaseModel): name: str From b36f5832da0fd5451e8f47e7388a5d607303b457 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:17:17 -0700 Subject: [PATCH 634/877] import custom pydantic validation --- mavecore/models/identifier.py | 6 ++---- mavecore/models/sequence.py | 5 +---- mavecore/models/target.py | 5 +---- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 73a11a8..31cae7e 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -15,8 +15,7 @@ class DoiIdentifier(Identifier): @validator('identifier') def must_be_valid_doi(cls, v): - if not idutils.is_doi(v): - raise ValidationError("{} is not a valid DOI identifier.".format(v)) + identifier.validate_doi_identifier(v) class PubmedIdentifier(Identifier): @@ -24,5 +23,4 @@ class PubmedIdentifier(Identifier): @validator('identifier') def must_be_valid_pubmed(cls, v): - if not idutils.is_pmid(v): - raise ValidationError("{} is not a valid PubMed identifier.".format(v)) + identifier.validate_pubmed_identifier(v) diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index eb2db35..3b2ae2f 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -9,7 +9,4 @@ class WildType(BaseModel): @validator('sequenceType') def validate_category(cls, v): - valid_sequence_types = ["Infer", "DNA", "Protein"] - if v not in valid_sequence_types: - raise ValidationError("{}'s is not a valid sequence type. 
Valid sequence types are " - "Infer, DNA, and Protein".format(v)) + target.validate_sequence_category(v) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index f0fb058..588ad9a 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -18,7 +18,4 @@ class TargetGene(BaseModel): @validator('category') def validate_category(cls, v): - valid_categories = ["Protein coding", "Regulatory", "Other noncoding"] - if v not in valid_categories: - raise ValidationError("{}'s is not a valid target category. Valid categories are " - "Protein coding, Regulatory, and Other noncoding".format(v)) + target.validate_target_category(v) From b3044abdb6aaaa59c3957172982691f815f1dfcd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:17:29 -0700 Subject: [PATCH 635/877] edit imports --- mavecore/models/identifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 31cae7e..73b2b92 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -1,6 +1,5 @@ from pydantic import BaseModel, ValidationError, validator, HttpUrl from typing import Optional -import idutils from mavecore.validation import identifier From e9df9af5645b91ab9f9a2ffe4f67e6c2148a9d97 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:17:44 -0700 Subject: [PATCH 636/877] edit error messages --- mavecore/validation/identifier.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/identifier.py b/mavecore/validation/identifier.py index 9c44e47..0b7aa84 100644 --- a/mavecore/validation/identifier.py +++ b/mavecore/validation/identifier.py @@ -46,7 +46,8 @@ def validate_pubmed_identifier(identifier): If the identifier is not a valid PubMed identifier. 
""" if not idutils.is_pmid(identifier): - raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") + #raise ValidationError(f"'{identifier} is not a valid PubMed identifier.") + raise ValidationError("{} is not a valid PubMed identifier.".format(identifier)) def validate_doi_identifier(identifier): @@ -64,7 +65,8 @@ def validate_doi_identifier(identifier): If the identifier is not a valid DOI identifier. """ if not idutils.is_doi(identifier): - raise ValidationError(f"'{identifier}' is not a valid DOI.") + #raise ValidationError(f"'{identifier}' is not a valid DOI.") + raise ValidationError("{} is not a valid DOI identifier.".format(identifier)) def validate_ensembl_identifier(identifier): From 967a4fea8ed80750580ca80bd6ca3d09740ad53c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:17:58 -0700 Subject: [PATCH 637/877] rename file --- mavecore/validation/constants/{summary.py => keywords.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mavecore/validation/constants/{summary.py => keywords.py} (100%) diff --git a/mavecore/validation/constants/summary.py b/mavecore/validation/constants/keywords.py similarity index 100% rename from mavecore/validation/constants/summary.py rename to mavecore/validation/constants/keywords.py From 8accfcd0e5b3f5e78656bcc90a1309bd59417cd7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:18:19 -0700 Subject: [PATCH 638/877] add target constants and validation code --- mavecore/validation/constants/target.py | 2 ++ mavecore/validation/target.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 mavecore/validation/constants/target.py create mode 100644 mavecore/validation/target.py diff --git a/mavecore/validation/constants/target.py b/mavecore/validation/constants/target.py new file mode 100644 index 0000000..0e83872 --- /dev/null +++ 
b/mavecore/validation/constants/target.py @@ -0,0 +1,2 @@ +valid_categories = ["Protein coding", "Regulatory", "Other noncoding"] +valid_sequence_types = ["Infer", "DNA", "Protein"] \ No newline at end of file diff --git a/mavecore/validation/target.py b/mavecore/validation/target.py new file mode 100644 index 0000000..21039c9 --- /dev/null +++ b/mavecore/validation/target.py @@ -0,0 +1,14 @@ +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants.target import valid_categories, valid_sequence_types + + +def validate_target_category(category): + if category not in valid_categories: + raise ValidationError("{}'s is not a valid target category. Valid categories are " + "Protein coding, Regulatory, and Other noncoding".format(category)) + + +def validate_sequence_category(sequence_type): + if sequence_type not in valid_sequence_types: + raise ValidationError("{}'s is not a valid sequence type. Valid sequence types are " + "Infer, DNA, and Protein".format(sequence_type)) From 15b20a9a51d8f6eca2f3a2cacb9fc458fa8444c3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:18:27 -0700 Subject: [PATCH 639/877] reformat file --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index db72ca6..c5fce5a 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -16,4 +16,4 @@ def is_null(value): True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. 
""" value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value \ No newline at end of file + return null_values_re.fullmatch(value) or not value From 850c15ad17a9d778818a514e3bb6ea0207f2cf70 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:25:46 -0700 Subject: [PATCH 640/877] edit imports --- tests/models/map.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/map.py b/tests/models/map.py index 6062b9d..a7c92bd 100644 --- a/tests/models/map.py +++ b/tests/models/map.py @@ -1,5 +1,4 @@ from unittest import TestCase -from pydantic import ValidationError from mavecore.models.map import ReferenceMap From 7a07ff2180230448911cf1948ed6f398b623b2e3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:27:31 -0700 Subject: [PATCH 641/877] add setUp methods to test classes --- tests/models/map.py | 14 +++++--------- tests/models/target.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/models/map.py b/tests/models/map.py index a7c92bd..9dd7418 100644 --- a/tests/models/map.py +++ b/tests/models/map.py @@ -3,18 +3,14 @@ class TestReferenceMap(TestCase): - def test_valid_all_fields(self): - genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = { - "id": 0, + def setUp(self): + self.reference_map = { "genomeId": 0, "targetId": 0, - "isPrimary": True, - "genome": genome, - "creationDate": "2022-02-02", - "modificationDate": "2022-02-02", } - ReferenceMap.parse_obj(reference_map) + + def test_valid_all_fields(self): + ReferenceMap.parse_obj(self.reference_map) def test_valid_exclude_optional(self): genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} diff --git a/tests/models/target.py b/tests/models/target.py index 163f787..49c3515 100644 --- a/tests/models/target.py +++ b/tests/models/target.py @@ -4,6 +4,19 
@@ class TestTargetGene(TestCase): + def setUp(self): + reference_map = {"genomeId": 0, "targetId": 0} + sequence = {"sequenceType": "Protein", "sequence": "ATCG"} + self.target = { + "name": "name", + "category": "Protein coding", + "ensembleIdId": 0, + "refseqIdId": 0, + "uniprotIdId": 0, + "referenceMaps": [reference_map], + "wtSequence": sequence, + } + def test_valid_all_fields(self): genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} From 8a62511a51a1d79c6be722885e374d31073a38d5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:28:21 -0700 Subject: [PATCH 642/877] refactor test cases to work with setUp methods --- tests/models/target.py | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/tests/models/target.py b/tests/models/target.py index 49c3515..102ed99 100644 --- a/tests/models/target.py +++ b/tests/models/target.py @@ -18,37 +18,14 @@ def setUp(self): } def test_valid_all_fields(self): - genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} - sequence = {"sequenceType": "type", "sequence": "ATCG"} - target = { - "name": "name", - "category": "Protein coding", - "referenceMaps": [reference_map], - "wtSequence": sequence, - } - TargetGene.parse_obj(target) + TargetGene.parse_obj(self.target) def test_invalid_category(self): - genome = {"shortName": "names", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} - sequence = {"sequenceType": "type", "sequence": "ATCG"} - target = { - "name": "name", - "category": "Protein", - "referenceMaps": [reference_map], - "wtSequence": sequence, - } + self.target["category"] = 
"Protein" with self.assertRaises(ValidationError): - TargetGene.parse_obj(target) + TargetGene.parse_obj(self.target) def test_invalid_missing_required_field(self): - genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = {"id": 0, "genomeId": 0, "targetId": 0, "isPrimary": True, "genome": genome} - target = { - "name": "name", - "category": "Protein coding", - "referenceMaps": [reference_map], - } + self.target.pop("wtSequence") with self.assertRaises(ValidationError): - TargetGene.parse_obj(target) + TargetGene.parse_obj(self.target) From 8529a56acff6052067bfe62e9da6311d6b5e258f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:29:14 -0700 Subject: [PATCH 643/877] delete unneeded test cases --- tests/models/map.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/models/map.py b/tests/models/map.py index 9dd7418..33c1f60 100644 --- a/tests/models/map.py +++ b/tests/models/map.py @@ -11,27 +11,3 @@ def setUp(self): def test_valid_all_fields(self): ReferenceMap.parse_obj(self.reference_map) - - def test_valid_exclude_optional(self): - genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = { - "id": 0, - "genomeId": 0, - "targetId": 0, - "isPrimary": True, - "genome": genome, - } - ReferenceMap.parse_obj(reference_map) - - def test_invalid_creation_date(self): - genome = {"shortName": "name", "organismName": "organism", "genomeId": 0, "id": 0} - reference_map = { - "id": 0, - "genomeId": 0, - "targetId": 0, - "isPrimary": True, - "genome": genome, - "creationDate": "2022-02-02-", - } - with self.assertRaises(ValidationError): - ReferenceMap.parse_obj(reference_map) From 0ab0f261126f6363e169292fefbfcbed8c5c456d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:33:05 -0700 Subject: [PATCH 644/877] add target test cases --- 
tests/validation/target.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/validation/target.py diff --git a/tests/validation/target.py b/tests/validation/target.py new file mode 100644 index 0000000..e69de29 From fc0832226dc590509ff6acc9f665c732ab54c310 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:33:09 -0700 Subject: [PATCH 645/877] add urn test cases --- tests/validation/urn.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/validation/urn.py diff --git a/tests/validation/urn.py b/tests/validation/urn.py new file mode 100644 index 0000000..e69de29 From 7eb1d117eb61bdb862208a79ea48862e96d405dd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:50:19 -0700 Subject: [PATCH 646/877] add urn test cases --- tests/validation/urn.py | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/validation/urn.py b/tests/validation/urn.py index e69de29..9be0344 100644 --- a/tests/validation/urn.py +++ b/tests/validation/urn.py @@ -0,0 +1,79 @@ +from unittest import TestCase + +from mavecore.validation.urn import * + + +class TestValidateUrn(TestCase): + def test_valid_mavedb_urn(self): + validate_mavedb_urn("urn:mavedb:00000002-a-1") + + def test_invalid_mavedb_urn(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn("urn:mavedb:00000002-a-1-z") + + def test_valid_mavedb_urn_experimentset(self): + validate_mavedb_urn_experimentset("") + pass + + def test_invalid_mavedb_urn_experimentset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experimentset("") + + def test_valid_mavedb_urn_experiment(self): + validate_mavedb_urn_experiment("urn:mavedb:00000001-a") + + def test_invalid_mavedb_urn_experiment(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experiment("") + + def 
test_valid_mavedb_urn_scoreset(self): + validate_mavedb_urn_scoreset("urn:mavedb:00000001-a-1") + + def test_invalid_mavedb_urn_scoreset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_scoreset("") + + def test_valid_mavedb_urn_variant(self): + validate_mavedb_urn_variant("") + + def test_invalid_mavedb_urn_variant(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_variant("urn:mavedb:00000002-a-1") # this is a scoreset urn + + +class TestValidateTmpUrn(TestCase): + def test_valid_tmp_mavedb_urn(self): + validate_mavedb_urn("urn:mavedb:00000002-a-1") + + def test_invalid_tmp_mavedb_urn(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn("urn:mavedb:00000002-a-1-z") + + def test_valid_tmp_mavedb_urn_experimentset(self): + validate_mavedb_urn_experimentset("") + pass + + def test_invalid_tmp_mavedb_urn_experimentset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experimentset("") + + def test_valid_tmp_mavedb_urn_experiment(self): + validate_mavedb_urn_experiment("urn:mavedb:00000001-a") + + def test_invalid_tmp_mavedb_urn_experiment(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_experiment("") + + def test_valid_tmp_mavedb_urn_scoreset(self): + validate_mavedb_urn_scoreset("urn:mavedb:00000001-a-1") + + def test_invalid_tmp_mavedb_urn_scoreset(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_scoreset("") + + def test_valid_tmp_mavedb_urn_variant(self): + validate_mavedb_urn_variant("") + + def test_invalid_tmp_mavedb_urn_variant(self): + with self.assertRaises(ValidationError): + validate_mavedb_urn_variant("urn:mavedb:00000002-a-1") # this is a scoreset urn From cec5bee596b5aa84959e75b3eb177d4bddf6b7db Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 09:50:29 -0700 Subject: [PATCH 647/877] edit imports --- tests/validation/variant.py | 1 - 1 file changed, 1 deletion(-) diff 
--git a/tests/validation/variant.py b/tests/validation/variant.py index ea35e6d..1b258d8 100644 --- a/tests/validation/variant.py +++ b/tests/validation/variant.py @@ -1,4 +1,3 @@ -# from core.utilities import null_values_list from unittest import TestCase from mavecore.validation.variant import validate_hgvs_string From 3d24cc3762f48c3a09603db2e28adce9e4cfd0f1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 11:49:05 -0700 Subject: [PATCH 648/877] delete files --- tests/validation/test_dataset_validators.py | 271 -------- tests/validation/test_validators.py | 650 -------------------- 2 files changed, 921 deletions(-) delete mode 100644 tests/validation/test_dataset_validators.py delete mode 100644 tests/validation/test_validators.py diff --git a/tests/validation/test_dataset_validators.py b/tests/validation/test_dataset_validators.py deleted file mode 100644 index 66d3691..0000000 --- a/tests/validation/test_dataset_validators.py +++ /dev/null @@ -1,271 +0,0 @@ -from io import BytesIO, StringIO -from unittest import TestCase - - -import pandas as pd - -from mavecore.validation import constants - -from mavecore.validation.dataset_validators import ( - validate_scoreset_count_data_input, - validate_scoreset_score_data_input, - validate_at_least_one_additional_column, - validate_has_hgvs_in_header, - validate_header_contains_no_null_columns, - read_header_from_io, - validate_scoreset_json, - validate_datasets_define_same_variants, -) - - -class TestNoNullInColumnsValidator(TestCase): - """ - Tests to ensure that an input file contains no null values in the header - such as '', None, null etc. 
- """ - - def test_raises_valuerror_when_null_values_in_column(self): - for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_header_contains_no_null_columns(header) - - def test_does_not_raise_valuerror_when_non_null_values_in_column( - self, - ): - file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_header_contains_no_null_columns(header) # Should pass - - -class TestAtLeastOneNumericColumnValidator(TestCase): - """ - Tests to ensure that an input file contains at least two columns. - """ - - def test_raises_valuerror_when_less_than_2_values_in_column(self): - file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) - - def test_does_not_raise_valuerror_2_or_more_values_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) # Should pass - - file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) # Should pass - - -class TestHgvsInHeaderValidator(TestCase): - """ - Tests that case-sensitive 'hgvs' is in the header of a file. 
- """ - - def test_raises_valuerror_when_neither_hgvs_col_in_column(self): - file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) - - def test_hgvs_must_be_lowercase(self): - file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() - ) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) - - def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): - file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) # Should pass - - file = BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) # Should pass - - -class TestValidateScoreCountsDefineSameVariants(TestCase): - """ - Tests that an uploaded score/counts files define the same variants - in both the _nt column and _pro column. 
- """ - - def test_ve_counts_defines_different_nt_variants(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: [None], - constants.hgvs_splice_column: [None], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.2A>G"], - constants.hgvs_pro_column: [None], - constants.hgvs_splice_column: [None], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts) - - def test_ve_counts_defines_different_splice_variants(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: ["c.1A>G"], - constants.hgvs_pro_column: [None], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: ["c.2A>G"], - constants.hgvs_pro_column: [None], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts) - - def test_ve_counts_defines_different_pro_variants(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: [None], - constants.hgvs_pro_column: ["p.Leu5Glu"], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: [None], - constants.hgvs_pro_column: ["p.Leu75Glu"], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts) - - def test_passes_when_same_variants_defined(self): - scores = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: ["p.Leu5Glu"], - constants.hgvs_splice_column: ["c.1A>G"], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: ["p.Leu5Glu"], - constants.hgvs_splice_column: ["c.1A>G"], - } - ) - validate_datasets_define_same_variants(scores, counts) - - -class TestValidateScoreSetCountDataInputValidator(TestCase): - """ - Tests that validation errors are thrown when an ill-formatted count data - 
input file is supplied. - """ - - def test_raises_valuerror_when_hgvs_not_in_column(self): - file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file) - - def test_raises_valuerror_no_numeric_column(self): - file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file) - - def test_raises_valuerror_when_null_values_in_column(self): - for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file) - - -class TestValidateScoreSetScoreDataInputValidator(TestCase): - """ - Tests that validation errors are thrown when an ill-formatted score data - input file is supplied. - """ - - def test_raises_valuerror_when_hgvs_not_in_column(self): - file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - def test_raises_valuerror_no_numeric_column(self): - file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - def test_raises_valuerror_when_null_values_in_column(self): - for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - def test_validatation_error_score_not_in_header(self): - file = BytesIO("{},count\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file) - - -class TestValidateScoreSetJsonValidator(TestCase): - """ - Test to ensure that a scoreset json field is properly formatted. 
- """ - - def test_valueerror_unexptected_columns(self): - field = { - "extra_column": [], - constants.score_columns: ["score"], - constants.count_columns: [], - } - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_values_not_lists(self): - field = {constants.score_columns: ["score"], constants.count_columns: {}} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_list_values_not_strings(self): - field = {constants.score_columns: [b"score"], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_empty_score_columns(self): - field = {constants.score_columns: [], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_missing_dict_columns(self): - # constants.score_columns missing - field = {constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - # constants.count_columns missing - field = {constants.score_columns: ["score"]} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - def test_valueerror_missing_header_columns(self): - # constants.score_columns columns missing 'score' - field = {constants.score_columns: ["hgvs"], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) diff --git a/tests/validation/test_validators.py b/tests/validation/test_validators.py deleted file mode 100644 index 389d0ff..0000000 --- a/tests/validation/test_validators.py +++ /dev/null @@ -1,650 +0,0 @@ -from io import StringIO -import unittest -from unittest import TestCase -from random import choice - -import pandas as pd -from pandas.testing import assert_index_equal - -# from dataset import constants -from mavecore.validation import constants -from mavecore.validation.exceptions import ValidationError - -from mavecore.validation.variant_validators import ( - 
MaveDataset, - validate_variant_json, - validate_hgvs_string, -) - - -def generate_hgvs(prefix: str = "c") -> str: - """Generates a random hgvs string from a small sample.""" - if prefix == "p": - # Subset of 3-letter codes, chosen at random. - amino_acids = [ - "Ala", - "Leu", - "Gly", - "Val", - "Tyr", - "Met", - "Cys", - "His", - "Glu", - "Phe", - ] - ref = choice(amino_acids) - alt = choice(amino_acids) - return f"{prefix}.{ref}{choice(range(1, 100))}{alt}" - else: - alt = choice("ATCG") - ref = choice("ATCG") - return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}" - - - -class TestMaveDataset(TestCase): - """ - Tests the validator :func:`validate_variant_rows` to check if the correct - errors are thrown when invalid rows are encountered in a - scores/counts/meta data input file. Checks for: - - Invalid HGVS string in a row - - Row HGVS is defined in more than one row - - Row values are not int/float for a count/score file - - Tests also check to see if the correct header and hgvs data information - is parsed and returned. 
- """ - - SCORE_COL = constants.required_score_column - HGVS_NT_COL = constants.hgvs_nt_column - HGVS_SPLICE_COL = constants.hgvs_splice_column - HGVS_PRO_COL = constants.hgvs_pro_column - - @staticmethod - def mock_return_value(data, index=None): - df = pd.read_csv(StringIO(data), sep=",", na_values=["None", None]) - if index: - df.index = pd.Index(df[index]) - return df - - def test_invalid_row_hgvs_is_not_a_string(self): - data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_missing_hgvs_columns(self): - data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL, generate_hgvs()) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_replaces_null_with_none_in_secondary_hgvs_column(self): - hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{},{}\n{},{},1.0 ".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.HGVS_PRO_COL]), [None] - ) - - def test_replaces_null_with_none_in_numeric_columns(self): - hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.SCORE_COL]), [None] - ) - - def test_invalid_null_values_in_header(self): - for 
value in constants.null_values_list: - with self.subTest(msg=f"'{value}'"): - data = "{},{},{}\n{},1.0,1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, value, generate_hgvs() - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_no_additional_columns_outside_hgvs_ones(self): - data = "{},{},{}\n{},{},{}".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_scores_missing_scores_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, "scores_rna", generate_hgvs(prefix="g"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_missing_either_required_hgvs_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_SPLICE_COL, self.SCORE_COL, generate_hgvs(prefix="c"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_empty_no_variants_parsed(self): - data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_empty) - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_error_non_numeric_values_in_score_column(self): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - "I am not a number", - ) - - with 
self.assertRaises(ValueError): - MaveDataset.for_scores(StringIO(data)) - - def test_invalid_same_hgvs_nt_defined_in_two_rows(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): - hgvs = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_data_method_converts_null_values_to_None(self): - hgvs = generate_hgvs() - for value in constants.null_values_list: - with self.subTest(msg=value): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, value - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - - df = dataset.data(serializable=True) - self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) - self.assertIsNone(df[self.SCORE_COL].values[0]) - - def test_sorts_header(self): - hgvs_nt = generate_hgvs(prefix="g") - hgvs_pro = generate_hgvs(prefix="p") - hgvs_splice = generate_hgvs(prefix="c") - data = "{},{},{},{},{}\n{},{},{},{},{}".format( - self.HGVS_PRO_COL, - self.HGVS_NT_COL, - "colA", - self.SCORE_COL, - self.HGVS_SPLICE_COL, - hgvs_pro, - hgvs_nt, - "hello", - 1.0, - hgvs_splice, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - dataset.columns, - [ - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - "colA", - ], - ) - - def test_does_not_allow_wt_and_sy(self): - wt = "_wt" - sy = "_sy" - 
data = "{},{},{},{}\n{},{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - wt, - wt, - sy, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 3) - print(dataset.errors) - - def test_parses_numeric_column_values_into_float(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - value = dataset.data()[self.SCORE_COL].values[0] - self.assertIsInstance(value, float) - - def test_does_not_split_double_quoted_variants(self): - hgvs = "c.[123A>G;124A>G]" - data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertIn(hgvs, dataset.data()[self.HGVS_NT_COL]) - - # def test_invalid_non_double_quoted_multi_variant_row(self): - # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) - # data = "{},{}\n'{}',1.0".format( - # constants.hgvs_nt_column, required_score_column, hgvs - # ) - # with self.assertRaises(ValidationError): - # _ = validate_variant_rows(BytesIO(data.encode())) - - def test_primary_column_is_pro_when_nt_is_not_defined(self): - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0".format(self.HGVS_PRO_COL, self.SCORE_COL, hgvs_pro) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_PRO_COL) - - def test_primary_column_is_nt_by_default(self): - hgvs_nt = generate_hgvs(prefix="c") - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, hgvs_pro - ) - - dataset = 
MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_NT_COL) - - def test_error_missing_value_in_nt_column_when_nt_is_primary(self): - for v in constants.null_values_list: - with self.subTest(msg=v): - data = ( - "{},{},{}\n" - "{},{},1.0\n" - "{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - v, - generate_hgvs(prefix="p"), - ) - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_error_missing_value_in_pro_column_when_pro_is_primary(self): - for v in constants.null_values_list: - with self.subTest(msg=v): - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_df_indexed_by_primary_column(self): - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - assert_index_equal(dataset.data().index, dataset.index) - - def test_invalid_duplicates_in_index(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{},{}\n{},{},1.0\n{},{},2.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - hgvs, - generate_hgvs(prefix="p"), - hgvs, - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def 
test_invalid_hgvs_in_column(self): - tests = [ - (self.HGVS_PRO_COL, generate_hgvs(prefix="c")), - (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")), - (self.HGVS_NT_COL, generate_hgvs(prefix="p")), - ] - for (column, variant) in tests: - with self.subTest(msg=f"{column}: {variant}"): - if column == self.HGVS_SPLICE_COL: - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - column, - self.SCORE_COL, - generate_hgvs(prefix="g"), - variant, - ) - else: - data = "{},{}\n{},1.0".format(column, self.SCORE_COL, variant) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): - data = "{},{}\n{},1.0\n{},2.0".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors) - - def test_invalid_nt_not_genomic_when_splice_present(self): - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_splice_defined_when_nt_is_not(self): - data = "{},{},{}\n,{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors) - - def test_invalid_splice_not_defined_when_nt_is_genomic(self): - data = "{},{}\n{},1.0".format( - self.HGVS_NT_COL, 
self.SCORE_COL, generate_hgvs(prefix="g") - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors) - - def test_invalid_zero_is_not_parsed_as_none(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 0) - - def test_invalid_close_to_zero_is_not_parsed_as_none(self): - hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15) - - def test_defines_same_variants(self): - tests = [ - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - True, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.2A>G,0.0".format(self.HGVS_NT_COL), - False, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - True, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Phe,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - False, - ), - # Check returns None if either dataset invalid - ( - "wrong_columns,{}\nc.1A>G,0.0".format(self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - None, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "wrong_column,count\nc.1A>G,0.0".format(), - None, - ), - ] - - for (scores, 
counts, expected) in tests: - with self.subTest(msg=(scores, counts, expected)): - scores_dataset = MaveDataset.for_scores(StringIO(scores)) - scores_dataset.validate() - - counts_dataset = MaveDataset.for_counts(StringIO(counts)) - counts_dataset.validate() - - self.assertEqual(scores_dataset.match_other(counts_dataset), expected) - - def test_to_dict(self): - hgvs_1 = generate_hgvs(prefix="c") - hgvs_2 = generate_hgvs(prefix="c") - data = "{},{},{},{}\n{},,,\n{},,,1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - hgvs_1, - hgvs_2, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertDictEqual( - dataset.to_dict(), - { - hgvs_1: { - self.HGVS_NT_COL: hgvs_1, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: None, - }, - hgvs_2: { - self.HGVS_NT_COL: hgvs_2, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: 1.0, - }, - }, - ) - - def test_valid_targetseq_validation_fails(self): - data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertTrue(dataset.is_valid) - - def test_invalid_targetseq_validation_fails(self): - data = "{},{},{}\nc.1A>G,p.Val1Phe,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("p.Val1Phe", dataset.errors[0]) - - def test_invalid_target_sequence_not_a_multiple_of_3(self): - data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATCG") - - 
self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("multiple of 3", dataset.errors[0]) - - @unittest.expectedFailure - def test_invalid_relaxed_ordering_check_fails(self): - self.fail("Test is pending") From 3a6fda2ab4c996331d04b0b6b0eee07b1cf13911 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 11:49:42 -0700 Subject: [PATCH 649/877] add note to file --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 536c91a..2c8ae3a 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -94,6 +94,7 @@ def validate_column_names(columns): raise ValidationError("There must be at least one additional column beyond the hgvs columns.") # validate against UTF-8byte ordering marks # TODO if dataframe is a scores df make sure it has a score column + # also make sure counts df has a counts column def validate_variants(variants, column_name=None): From aaa456481c33b896b564556e4ed941c20bb2203b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 11:50:25 -0700 Subject: [PATCH 650/877] add TODO to resolve issues with tmp urn --- tests/validation/urn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/validation/urn.py b/tests/validation/urn.py index 9be0344..0bb122c 100644 --- a/tests/validation/urn.py +++ b/tests/validation/urn.py @@ -42,8 +42,9 @@ def test_invalid_mavedb_urn_variant(self): class TestValidateTmpUrn(TestCase): + # TODO consider the way we are making the tmp urn strings def test_valid_tmp_mavedb_urn(self): - validate_mavedb_urn("urn:mavedb:00000002-a-1") + validate_mavedb_urn("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") def test_invalid_tmp_mavedb_urn(self): with self.assertRaises(ValidationError): @@ -65,7 +66,7 @@ def 
test_invalid_tmp_mavedb_urn_experiment(self): validate_mavedb_urn_experiment("") def test_valid_tmp_mavedb_urn_scoreset(self): - validate_mavedb_urn_scoreset("urn:mavedb:00000001-a-1") + validate_mavedb_urn_scoreset("tmp:a56b8eb08e190490") def test_invalid_tmp_mavedb_urn_scoreset(self): with self.assertRaises(ValidationError): From 602841aefc2cc845bbb4229aa3ff67ed12eaeb74 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:22:07 -0700 Subject: [PATCH 651/877] refactor validate dataframe function --- mavecore/validation/dataframe.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 2c8ae3a..7737cb4 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -24,14 +24,12 @@ def validate_dataframes(scores=None, counts=None): If any of the validation fails. """ validate_no_null_columns_or_rows(scores) - hgvs_columns = validate_column_names(scores.columns) - for column in hgvs_columns: - validate_variants(scores[column]) + validate_column_names(scores.columns) + validate_values_by_column(scores) if counts is not None: validate_no_null_columns_or_rows(counts) - hgvs_columns = validate_column_names(counts.columns) - for column in hgvs_columns: - validate_variants(counts[column]) + validate_column_names(counts.columns) + validate_values_by_column(counts) validate_dataframes_define_same_variants(scores, counts) From 158d6769e72ca45b73f53c909f48a34c035c1d7a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:22:49 -0700 Subject: [PATCH 652/877] replace and reimplement validate_variants with validate_values_by_column --- mavecore/validation/dataframe.py | 60 ++++++++++++++------------------ 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 
7737cb4..24bf806 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -95,40 +95,34 @@ def validate_column_names(columns): # also make sure counts df has a counts column -def validate_variants(variants, column_name=None): +def validate_values_by_column(dataset): """ - Validates a string of variants and verifies that the variant type in the column name makes - sense with regards to the actual variants. - - Parameters - __________ - variants: list[str] - List of mavehgvs formatted strings. - column_name: str - The hgvs column name from which the variants parameter originates. - - Raises - ______ - ValidationError - If any variant in the list of variants does not adhere to the mavehgvs specifications. - """ - # variant strings will be cast into hgvs variant objects to validate - for variant in variants: - if column_name == "hgvs_nt": - column = "nt" - elif column_name == "hgvs_pro": - column = "p" - elif column_name == "hgvs_splice": - column = "splice" - validate_hgvs_string(variant, column=column) - '''try: - v = Variant(variant) - # variants should align with the hgvs column names - # check this by seeing if the prefix makes sense with regards to the hgvs column name - validate_variant_matches_hgvs_column_name(column_name, v.prefix) - except ValidationError: - raise ValidationError(variant + " does not adhere to mavehgvs variant guidelines.")''' - + Validates a string of variants and verifies that the variant type in the column name makes + sense with regards to the actual variants. + + Parameters + __________ + variants: list[str] + List of mavehgvs formatted strings. + column_name: str + The hgvs column name from which the variants parameter originates. + + Raises + ______ + ValidationError + If any variant in the list of variants does not adhere to the mavehgvs specifications. 
+ """ + for column in dataset.columns: + if column == hgvs_nt_column: + dataset[[hgvs_nt_column]].apply(validate_hgvs_string(column="nt")) + elif column == hgvs_pro_column: + dataset[[hgvs_pro_column]].apply(validate_hgvs_string(column="p")) + elif column == hgvs_splice_column: + dataset[[hgvs_splice_column]].apply(validate_hgvs_string(column="splice")) + elif column == required_score_column: + dataset[[required_score_column]].apply(validate_score()) + else: + pass def validate_variant_matches_hgvs_column_name(variant, column_name): """ From e2adddf2d207fe2ec8d092d45990f9aa14a0e186 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:23:21 -0700 Subject: [PATCH 653/877] delete unused function and declare validate score function --- mavecore/validation/dataframe.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 24bf806..4e70bd5 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -124,22 +124,8 @@ def validate_values_by_column(dataset): else: pass -def validate_variant_matches_hgvs_column_name(variant, column_name): - """ - Checks that a variant makes sense with regards to the hgvs column name. - - Parameters - __________ - variants: list[str] - List of mavehgvs formatted strings. - column_name: str - The hgvs column name from which the variants parameter originates. - Raises - ______ - ValidationError - If the variant does not make sense with regards to the hgvs column name. 
- """ +def validate_score(score): pass From 4ad50f035c8693aaab2528d2d5b3ac363d0f2456 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:24:25 -0700 Subject: [PATCH 654/877] outline unittests for dataframe --- tests/validation/dataframe.py | 931 +++++++++++++++++++++++++++++++++- 1 file changed, 929 insertions(+), 2 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index f4cebba..2fce730 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -3,6 +3,29 @@ from mavecore.validation.dataframe import * +""" +from io import BytesIO, StringIO +from unittest import TestCase + + +import pandas as pd + +from mavecore.validation import constants + +from mavecore.validation.dataset_validators import ( + validate_scoreset_count_data_input, + validate_scoreset_score_data_input, + validate_at_least_one_additional_column, + validate_has_hgvs_in_header, + validate_header_contains_no_null_columns, + read_header_from_io, + validate_scoreset_json, + validate_datasets_define_same_variants, +) + + +""" + class TestValidateNoNullColumnsOrRows(TestCase): def setUp(self): @@ -74,7 +97,7 @@ def setUp(self): ) def test_valid_variants(self): - validate_variants(self.dataframe["hgvs_nt"]) + pass #validate_variants(self.dataframe[hgvs_nt_column], hgvs_nt_column) def test_invalid_variants(self): pass @@ -126,4 +149,908 @@ def test_counts_defines_different_splice_variants(self): def test_counts_defines_different_pro_variants(self): self.counts[hgvs_pro_column][0] = "p.Leu75Glu" with self.assertRaises(ValidationError): - validate_dataframes_define_same_variants(self.scores, self.counts) \ No newline at end of file + validate_dataframes_define_same_variants(self.scores, self.counts) + + +class TestNoNullInColumnsValidator(TestCase): + """ + Tests to ensure that an input file contains no null values in the header + such as '', None, null etc. 
+ """ + + def test_raises_valuerror_when_null_values_in_column(self): + '''for value in constants.null_values_list: + file = BytesIO( + "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + ) + with self.assertRaises(ValueError): + header = read_header_from_io(file) + validate_header_contains_no_null_columns(header)''' + + def test_does_not_raise_valuerror_when_non_null_values_in_column( + self, + ): + '''file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) + header = read_header_from_io(file) + validate_header_contains_no_null_columns(header) # Should pass''' + + +class TestAtLeastOneNumericColumnValidator(TestCase): + """ + Tests to ensure that an input file contains at least two columns. + """ + + def test_raises_valuerror_when_less_than_2_values_in_column(self): + '''file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) + with self.assertRaises(ValueError): + header = read_header_from_io(file) + validate_at_least_one_additional_column(header)''' + + def test_does_not_raise_valuerror_2_or_more_values_in_column(self): + '''file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + header = read_header_from_io(file) + validate_at_least_one_additional_column(header) # Should pass + + file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) + header = read_header_from_io(file) + validate_at_least_one_additional_column(header) # Should pass''' + + +class TestHgvsInHeaderValidator(TestCase): + """ + Tests that case-sensitive 'hgvs' is in the header of a file. 
+ """ + + def test_raises_valuerror_when_neither_hgvs_col_in_column(self): + '''file = BytesIO("score,count\n".encode()) + with self.assertRaises(ValueError): + header = read_header_from_io(file) + validate_has_hgvs_in_header(header)''' + + def test_hgvs_must_be_lowercase(self): + '''file = BytesIO( + "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() + ) + with self.assertRaises(ValueError): + header = read_header_from_io(file) + validate_has_hgvs_in_header(header)''' + + def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): + '''file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) + header = read_header_from_io(file) + validate_has_hgvs_in_header(header) # Should pass + + file = BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) + header = read_header_from_io(file) + validate_has_hgvs_in_header(header) # Should pass''' + + +class TestValidateScoreCountsDefineSameVariants(TestCase): + """ + Tests that an uploaded score/counts files define the same variants + in both the _nt column and _pro column. 
+ """ + + def test_ve_counts_defines_different_nt_variants(self): + '''scores = pd.DataFrame( + { + constants.hgvs_nt_column: ["c.1A>G"], + constants.hgvs_pro_column: [None], + constants.hgvs_splice_column: [None], + } + ) + counts = pd.DataFrame( + { + constants.hgvs_nt_column: ["c.2A>G"], + constants.hgvs_pro_column: [None], + constants.hgvs_splice_column: [None], + } + ) + with self.assertRaises(ValueError): + validate_datasets_define_same_variants(scores, counts)''' + + def test_ve_counts_defines_different_splice_variants(self): + '''scores = pd.DataFrame( + { + constants.hgvs_nt_column: [None], + constants.hgvs_splice_column: ["c.1A>G"], + constants.hgvs_pro_column: [None], + } + ) + counts = pd.DataFrame( + { + constants.hgvs_nt_column: [None], + constants.hgvs_splice_column: ["c.2A>G"], + constants.hgvs_pro_column: [None], + } + ) + with self.assertRaises(ValueError): + validate_datasets_define_same_variants(scores, counts)''' + + def test_ve_counts_defines_different_pro_variants(self): + '''scores = pd.DataFrame( + { + constants.hgvs_nt_column: [None], + constants.hgvs_splice_column: [None], + constants.hgvs_pro_column: ["p.Leu5Glu"], + } + ) + counts = pd.DataFrame( + { + constants.hgvs_nt_column: [None], + constants.hgvs_splice_column: [None], + constants.hgvs_pro_column: ["p.Leu75Glu"], + } + ) + with self.assertRaises(ValueError): + validate_datasets_define_same_variants(scores, counts)''' + + def test_passes_when_same_variants_defined(self): + '''scores = pd.DataFrame( + { + constants.hgvs_nt_column: ["c.1A>G"], + constants.hgvs_pro_column: ["p.Leu5Glu"], + constants.hgvs_splice_column: ["c.1A>G"], + } + ) + counts = pd.DataFrame( + { + constants.hgvs_nt_column: ["c.1A>G"], + constants.hgvs_pro_column: ["p.Leu5Glu"], + constants.hgvs_splice_column: ["c.1A>G"], + } + ) + validate_datasets_define_same_variants(scores, counts)''' + + +class TestValidateScoreSetCountDataInputValidator(TestCase): + """ + Tests that validation errors are thrown when an 
ill-formatted count data + input file is supplied. + """ + + def test_raises_valuerror_when_hgvs_not_in_column(self): + '''file = BytesIO("score,count\n".encode()) + with self.assertRaises(ValueError): + validate_scoreset_count_data_input(file)''' + + def test_raises_valuerror_no_numeric_column(self): + '''file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) + with self.assertRaises(ValueError): + validate_scoreset_count_data_input(file)''' + + def test_raises_valuerror_when_null_values_in_column(self): + '''for value in constants.null_values_list: + file = BytesIO( + "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + ) + with self.assertRaises(ValueError): + validate_scoreset_count_data_input(file)''' + + +class TestValidateScoreSetScoreDataInputValidator(TestCase): + """ + Tests that validation errors are thrown when an ill-formatted score data + input file is supplied. + """ + + def test_raises_valuerror_when_hgvs_not_in_column(self): + '''file = BytesIO("score,count\n".encode()) + with self.assertRaises(ValueError): + validate_scoreset_score_data_input(file)''' + + def test_raises_valuerror_no_numeric_column(self): + '''file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) + with self.assertRaises(ValueError): + validate_scoreset_score_data_input(file)''' + + def test_raises_valuerror_when_null_values_in_column(self): + '''for value in constants.null_values_list: + file = BytesIO( + "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() + ) + with self.assertRaises(ValueError): + validate_scoreset_score_data_input(file)''' + + def test_validatation_error_score_not_in_header(self): + '''file = BytesIO("{},count\n".format(constants.hgvs_nt_column).encode()) + with self.assertRaises(ValueError): + validate_scoreset_score_data_input(file)''' + + +class TestValidateScoreSetJsonValidator(TestCase): + """ + Test to ensure that a scoreset json field is properly formatted. 
+ """ + + def test_valueerror_unexptected_columns(self): + '''field = { + "extra_column": [], + constants.score_columns: ["score"], + constants.count_columns: [], + } + with self.assertRaises(ValueError): + validate_scoreset_json(field)''' + + def test_valueerror_values_not_lists(self): + '''field = {constants.score_columns: ["score"], constants.count_columns: {}} + with self.assertRaises(ValueError): + validate_scoreset_json(field)''' + + def test_valueerror_list_values_not_strings(self): + '''field = {constants.score_columns: [b"score"], constants.count_columns: []} + with self.assertRaises(ValueError): + validate_scoreset_json(field)''' + + def test_valueerror_empty_score_columns(self): + '''field = {constants.score_columns: [], constants.count_columns: []} + with self.assertRaises(ValueError): + validate_scoreset_json(field)''' + + def test_valueerror_missing_dict_columns(self): + '''# constants.score_columns missing + field = {constants.count_columns: []} + with self.assertRaises(ValueError): + validate_scoreset_json(field) + + # constants.count_columns missing + field = {constants.score_columns: ["score"]} + with self.assertRaises(ValueError): + validate_scoreset_json(field)''' + + def test_valueerror_missing_header_columns(self): + '''# constants.score_columns columns missing 'score' + field = {constants.score_columns: ["hgvs"], constants.count_columns: []} + with self.assertRaises(ValueError): + validate_scoreset_json(field)''' + + +""" +from io import StringIO +import unittest +from unittest import TestCase +from random import choice + +import pandas as pd +from pandas.testing import assert_index_equal + +# from dataset import constants +from mavecore.validation import constants +from mavecore.validation.exceptions import ValidationError + +from mavecore.validation.variant_validators import ( + MaveDataset, +) +""" + +def generate_hgvs(prefix: str = "c") -> str: + """'''Generates a random hgvs string from a small sample.'''""" + '''if prefix == "p": + # 
Subset of 3-letter codes, chosen at random. + amino_acids = [ + "Ala", + "Leu", + "Gly", + "Val", + "Tyr", + "Met", + "Cys", + "His", + "Glu", + "Phe", + ] + ref = choice(amino_acids) + alt = choice(amino_acids) + return f"{prefix}.{ref}{choice(range(1, 100))}{alt}" + else: + alt = choice("ATCG") + ref = choice("ATCG") + return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}"''' + + + +class TestMaveDataset(TestCase): + """ + Tests the validator :func:`validate_variant_rows` to check if the correct + errors are thrown when invalid rows are encountered in a + scores/counts/meta data input file. Checks for: + - Invalid HGVS string in a row + - Row HGVS is defined in more than one row + - Row values are not int/float for a count/score file + + Tests also check to see if the correct header and hgvs data information + is parsed and returned. + """ + + '''SCORE_COL = constants.required_score_column + HGVS_NT_COL = constants.hgvs_nt_column + HGVS_SPLICE_COL = constants.hgvs_splice_column + HGVS_PRO_COL = constants.hgvs_pro_column''' + + @staticmethod + def mock_return_value(data, index=None): + '''df = pd.read_csv(StringIO(data), sep=",", na_values=["None", None]) + if index: + df.index = pd.Index(df[index]) + return df''' + + def test_invalid_row_hgvs_is_not_a_string(self): + '''data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_missing_hgvs_columns(self): + '''data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL, generate_hgvs()) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_replaces_null_with_none_in_secondary_hgvs_column(self): + '''hgvs_nt = generate_hgvs(prefix="c") + for c in 
constants.null_values_list: + with self.subTest(msg=f"'{c}'"): + data = "{},{},{}\n{},{},1.0 ".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, c + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertListEqual( + list(dataset.data(serializable=True)[self.HGVS_PRO_COL]), [None] + )''' + + def test_replaces_null_with_none_in_numeric_columns(self): + '''hgvs_nt = generate_hgvs(prefix="c") + for c in constants.null_values_list: + with self.subTest(msg=f"'{c}'"): + data = "{},{}\n{},{}".format( + self.HGVS_NT_COL, self.SCORE_COL, hgvs_nt, c + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertListEqual( + list(dataset.data(serializable=True)[self.SCORE_COL]), [None] + )''' + + def test_invalid_null_values_in_header(self): + '''for value in constants.null_values_list: + with self.subTest(msg=f"'{value}'"): + data = "{},{},{}\n{},1.0,1.0".format( + self.HGVS_NT_COL, self.SCORE_COL, value, generate_hgvs() + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_no_additional_columns_outside_hgvs_ones(self): + '''data = "{},{},{}\n{},{},{}".format( + self.HGVS_NT_COL, + self.HGVS_SPLICE_COL, + self.HGVS_PRO_COL, + generate_hgvs(prefix="g"), + generate_hgvs(prefix="c"), + generate_hgvs(prefix="p"), + ) + + dataset = MaveDataset.for_counts(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_scores_missing_scores_column(self): + '''data = "{},{}\n{},{}".format( + self.HGVS_NT_COL, "scores_rna", generate_hgvs(prefix="g"), 1.0 + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + 
self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_missing_either_required_hgvs_column(self): + '''data = "{},{}\n{},{}".format( + self.HGVS_SPLICE_COL, self.SCORE_COL, generate_hgvs(prefix="c"), 1.0 + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_empty_no_variants_parsed(self): + '''data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_empty) + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_error_non_numeric_values_in_score_column(self): + '''data = "{},{}\n{},{}".format( + self.HGVS_NT_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + "I am not a number", + ) + + with self.assertRaises(ValueError): + MaveDataset.for_scores(StringIO(data))''' + + def test_invalid_same_hgvs_nt_defined_in_two_rows(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},1.0\n{},1.0".format( + self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): + '''hgvs = generate_hgvs(prefix="p") + data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) + + dataset = MaveDataset.for_counts(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_data_method_converts_null_values_to_None(self): + '''hgvs = generate_hgvs() + for value in constants.null_values_list: + with self.subTest(msg=value): + data = "{},{}\n{},{}".format( + self.HGVS_NT_COL, 
self.SCORE_COL, hgvs, value + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + + df = dataset.data(serializable=True) + self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) + self.assertIsNone(df[self.SCORE_COL].values[0])''' + + def test_sorts_header(self): + '''hgvs_nt = generate_hgvs(prefix="g") + hgvs_pro = generate_hgvs(prefix="p") + hgvs_splice = generate_hgvs(prefix="c") + data = "{},{},{},{},{}\n{},{},{},{},{}".format( + self.HGVS_PRO_COL, + self.HGVS_NT_COL, + "colA", + self.SCORE_COL, + self.HGVS_SPLICE_COL, + hgvs_pro, + hgvs_nt, + "hello", + 1.0, + hgvs_splice, + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertListEqual( + dataset.columns, + [ + self.HGVS_NT_COL, + self.HGVS_SPLICE_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + "colA", + ], + )''' + + def test_does_not_allow_wt_and_sy(self): + '''wt = "_wt" + sy = "_sy" + data = "{},{},{},{}\n{},{},{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_SPLICE_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + wt, + wt, + sy, + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 3) + print(dataset.errors)''' + + def test_parses_numeric_column_values_into_float(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + value = dataset.data()[self.SCORE_COL].values[0] + self.assertIsInstance(value, float)''' + + def test_does_not_split_double_quoted_variants(self): + '''hgvs = "c.[123A>G;124A>G]" + data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertIn(hgvs, 
dataset.data()[self.HGVS_NT_COL]) + + # def test_invalid_non_double_quoted_multi_variant_row(self): + # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) + # data = "{},{}\n'{}',1.0".format( + # constants.hgvs_nt_column, required_score_column, hgvs + # ) + # with self.assertRaises(ValidationError): + # _ = validate_variant_rows(BytesIO(data.encode()))''' + + def test_primary_column_is_pro_when_nt_is_not_defined(self): + '''hgvs_pro = generate_hgvs(prefix="p") + data = "{},{}\n{},1.0".format(self.HGVS_PRO_COL, self.SCORE_COL, hgvs_pro) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertEqual(dataset.index_column, self.HGVS_PRO_COL)''' + + def test_primary_column_is_nt_by_default(self): + '''hgvs_nt = generate_hgvs(prefix="c") + hgvs_pro = generate_hgvs(prefix="p") + data = "{},{},{}\n{},{},1.0".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, hgvs_pro + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertEqual(dataset.index_column, self.HGVS_NT_COL)''' + + def test_error_missing_value_in_nt_column_when_nt_is_primary(self): + '''for v in constants.null_values_list: + with self.subTest(msg=v): + data = ( + "{},{},{}\n" + "{},{},1.0\n" + "{},{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + generate_hgvs(prefix="p"), + v, + generate_hgvs(prefix="p"), + ) + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_error_missing_value_in_pro_column_when_pro_is_primary(self): + '''for v in constants.null_values_list: + with self.subTest(msg=v): + data = "{},{}\n{},1.0\n{},1.0".format( + self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v + ) + + dataset = MaveDataset.for_scores(StringIO(data)) 
+ dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_df_indexed_by_primary_column(self): + '''data = "{},{},{}\n{},{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + generate_hgvs(prefix="p"), + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + assert_index_equal(dataset.data().index, dataset.index)''' + + def test_invalid_duplicates_in_index(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{},{}\n{},{},1.0\n{},{},2.0".format( + self.HGVS_NT_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + hgvs, + generate_hgvs(prefix="p"), + hgvs, + generate_hgvs(prefix="p"), + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_hgvs_in_column(self): + '''tests = [ + (self.HGVS_PRO_COL, generate_hgvs(prefix="c")), + (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")), + (self.HGVS_NT_COL, generate_hgvs(prefix="p")), + ] + for (column, variant) in tests: + with self.subTest(msg=f"{column}: {variant}"): + if column == self.HGVS_SPLICE_COL: + data = "{},{},{}\n{},{},1.0".format( + self.HGVS_NT_COL, + column, + self.SCORE_COL, + generate_hgvs(prefix="g"), + variant, + ) + else: + data = "{},{}\n{},1.0".format(column, self.SCORE_COL, variant) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): + '''data = "{},{}\n{},1.0\n{},2.0".format( + self.HGVS_NT_COL, + self.SCORE_COL, + generate_hgvs(prefix="g"), + generate_hgvs(prefix="c"), + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + 
self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 2) + print(dataset.errors)''' + + def test_invalid_nt_not_genomic_when_splice_present(self): + '''data = "{},{},{}\n{},{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_SPLICE_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + generate_hgvs(prefix="c"), + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_splice_defined_when_nt_is_not(self): + '''data = "{},{},{}\n,{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_SPLICE_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_invalid_splice_not_defined_when_nt_is_genomic(self): + '''data = "{},{}\n{},1.0".format( + self.HGVS_NT_COL, self.SCORE_COL, generate_hgvs(prefix="g") + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 2) + print(dataset.errors)''' + + def test_invalid_zero_is_not_parsed_as_none(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + df = dataset.data() + self.assertEqual(df[self.SCORE_COL].values[0], 0)''' + + def test_invalid_close_to_zero_is_not_parsed_as_none(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + df = dataset.data() + self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15)''' + + def 
test_defines_same_variants(self): + '''tests = [ + ( + "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), + "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), + True, + ), + ( + "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), + "{},count\nc.2A>G,0.0".format(self.HGVS_NT_COL), + False, + ), + ( + "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL + ), + "{},{},count\nc.1A>G,p.Ile1Val,0.0".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL + ), + True, + ), + ( + "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL + ), + "{},{},count\nc.1A>G,p.Ile1Phe,0.0".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL + ), + False, + ), + # Check returns None if either dataset invalid + ( + "wrong_columns,{}\nc.1A>G,0.0".format(self.SCORE_COL), + "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), + None, + ), + ( + "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), + "wrong_column,count\nc.1A>G,0.0".format(), + None, + ), + ] + + for (scores, counts, expected) in tests: + with self.subTest(msg=(scores, counts, expected)): + scores_dataset = MaveDataset.for_scores(StringIO(scores)) + scores_dataset.validate() + + counts_dataset = MaveDataset.for_counts(StringIO(counts)) + counts_dataset.validate() + + self.assertEqual(scores_dataset.match_other(counts_dataset), expected)''' + + def test_to_dict(self): + '''hgvs_1 = generate_hgvs(prefix="c") + hgvs_2 = generate_hgvs(prefix="c") + data = "{},{},{},{}\n{},,,\n{},,,1.0".format( + self.HGVS_NT_COL, + self.HGVS_PRO_COL, + self.HGVS_SPLICE_COL, + self.SCORE_COL, + hgvs_1, + hgvs_2, + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertDictEqual( + dataset.to_dict(), + { + hgvs_1: { + self.HGVS_NT_COL: hgvs_1, + self.HGVS_SPLICE_COL: None, + self.HGVS_PRO_COL: None, + self.SCORE_COL: None, + }, + hgvs_2: { + self.HGVS_NT_COL: hgvs_2, + self.HGVS_SPLICE_COL: 
None, + self.HGVS_PRO_COL: None, + self.SCORE_COL: 1.0, + }, + }, + )''' + + def test_valid_targetseq_validation_fails(self): + '''data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate(targetseq="ATC") + + self.assertTrue(dataset.is_valid)''' + + def test_invalid_targetseq_validation_fails(self): + '''data = "{},{},{}\nc.1A>G,p.Val1Phe,0.5".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate(targetseq="ATC") + + self.assertFalse(dataset.is_valid) + print(dataset.errors) + + self.assertEqual(dataset.n_errors, 1) + self.assertIn("p.Val1Phe", dataset.errors[0])''' + + def test_invalid_target_sequence_not_a_multiple_of_3(self): + '''data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( + self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate(targetseq="ATCG") + + self.assertFalse(dataset.is_valid) + print(dataset.errors) + + self.assertEqual(dataset.n_errors, 1) + self.assertIn("multiple of 3", dataset.errors[0])''' + + #@unittest.expectedFailure + def test_invalid_relaxed_ordering_check_fails(self): + '''self.fail("Test is pending")''' From ef2d554e778edd489734a75ba69ad1e9dd8075a6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:39:44 -0700 Subject: [PATCH 655/877] delete notes and reformat file --- mavecore/validation/keywords.py | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/mavecore/validation/keywords.py b/mavecore/validation/keywords.py index 8e56254..9d83abc 100644 --- a/mavecore/validation/keywords.py +++ b/mavecore/validation/keywords.py @@ -8,9 +8,7 @@ def validate_keywords(v): else: for keyword in v: validate_keyword(keyword) - '''if is_null(keyword) or not isinstance(keyword, str): - 
raise ValidationError("{} not a valid keyword. Keywords must be non null strings.".format(keyword)) -''' + def validate_keyword(keyword): """ @@ -28,25 +26,6 @@ def validate_keyword(keyword): ValidationError If the kw argument is not a valid string. """ - '''if is_null(kw) or not isinstance(kw, str): - raise ValidationError( - f"'{kw}' not a valid keyword. Keywords must be valid strings." - )''' if is_null(keyword) or not isinstance(keyword, str): raise ValidationError("{} not a valid keyword. Keywords must be non null strings.".format(keyword)) - -'''def validate_keyword_list(values): - """ - This function takes a list of keyword values and validates that each one is valid. - A valid keyword is a non-null string. The validate_keyword function will raise an - ValidationError if any of the keywords are invalid. - - Parameters - __________ - values : list[str] - The list of values to be validated. - """ - for value in values: - if not is_null(value): - validate_keyword(value)''' From 0a1ad9bd1ed45285a84250d81eec92efde5e949e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:05:10 -0700 Subject: [PATCH 656/877] edit ScoreSet pydantic model and validation to reflect API changes --- mavecore/models/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index a4259a4..e193859 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -26,15 +26,15 @@ class Experiment(DataSet): class ScoreSet(DataSet): - urn: Optional[str] dataUsagePolicy: str licenceId: int - replacesId: Optional[int] experimentUrn: str + supersededScoresetUrn: Optional[str] + metaAnalysisSourceScoresetUrns: Optional[List[str]] doiIdentifiers: Optional[List[DoiIdentifier]] pubmedIdentifiers: Optional[List[PubmedIdentifier]] targetGene: TargetGene - @validator('urn') + @validator('experimentUrn', 'supersededScoresetUrn') def 
validate_matches_regular_expression(cls, v): urn.validate_mavedb_urn_scoreset(v) From 32345bff7f7ee934f636591fc4406c34f8d42eb1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:06:07 -0700 Subject: [PATCH 657/877] edit Identifier pydantic model to reflect API changes --- mavecore/models/identifier.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 73b2b92..46e4954 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -6,8 +6,6 @@ class Identifier(BaseModel): identifier: str - id: Optional[int] - url: Optional[HttpUrl] class DoiIdentifier(Identifier): @@ -18,7 +16,6 @@ def must_be_valid_doi(cls, v): class PubmedIdentifier(Identifier): - referenceHtml: Optional[str] @validator('identifier') def must_be_valid_pubmed(cls, v): From 129a7d2f060225ad9dac90fea286d6ceefa2dc06 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:06:23 -0700 Subject: [PATCH 658/877] add ExternalIdentifier pydantic models to reflect API changes --- mavecore/models/identifier.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 46e4954..ff3a8be 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -20,3 +20,13 @@ class PubmedIdentifier(Identifier): @validator('identifier') def must_be_valid_pubmed(cls, v): identifier.validate_pubmed_identifier(v) + + +class ExternalIdentifierId(BaseModel): + dbname: str + identifier: str + + +class ExternalIdentifier(BaseModel): + identifier: ExternalIdentifierId + offset: int From 7f3d26aa0d3463a0e46e7a826a4676170c0e8ffb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:06:46 -0700 Subject: [PATCH 659/877] edit TargetGene pydantic model to reflect API changes --- 
mavecore/models/target.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 588ad9a..4c7078f 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -10,9 +10,7 @@ class TargetGene(BaseModel): name: str category: str - ensembleIdId: Optional[int] - refseqIdId: Optional[int] - uniprotIdId: Optional[int] + externalIdentifiers: List[ExternalIdentifier] referenceMaps: List[ReferenceMap] wtSequence: WildType From 20f746a0e8f8dba8a37f1a9c883603be34261417 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:06:53 -0700 Subject: [PATCH 660/877] edit imports --- mavecore/models/target.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 4c7078f..409eb79 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -5,6 +5,7 @@ from .sequence import WildType from mavecore.validation import target +from mavecore.models.identifier import ExternalIdentifier class TargetGene(BaseModel): From 26fcf62b622d8747f652ae71d9c25bb025018d42 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:34:06 -0700 Subject: [PATCH 661/877] make note to update regex for tmp URN to UUID --- mavecore/validation/constants/urn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/constants/urn.py b/mavecore/validation/constants/urn.py index 74d6b16..8bfd59e 100644 --- a/mavecore/validation/constants/urn.py +++ b/mavecore/validation/constants/urn.py @@ -8,10 +8,11 @@ # Temp URN patterns # --------------------------------------------------------------------------- # -MAVEDB_TMP_URN_PATTERN = r"^tmp:[A-Za-z0-9]{{{width}}}$".format( - width=MAVEDB_TMP_URN_DIGITS -) -MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) +#TODO get tmp pattern from UUID4 regex +#MAVEDB_TMP_URN_PATTERN = 
r"^tmp:[A-Za-z0-9]{{{width}}}$".format( +# width=MAVEDB_TMP_URN_DIGITS +#) +#MAVEDB_TMP_URN_RE = re.compile(MAVEDB_TMP_URN_PATTERN) # Experimentset Pattern/Compiled RE From 6dc2d0f4a4759ad647813fe245cd13948f3029d2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:34:28 -0700 Subject: [PATCH 662/877] comment out tmp URN constants --- mavecore/validation/constants/urn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/constants/urn.py b/mavecore/validation/constants/urn.py index 8bfd59e..99c816a 100644 --- a/mavecore/validation/constants/urn.py +++ b/mavecore/validation/constants/urn.py @@ -1,7 +1,7 @@ import re MAVEDB_EXPERIMENTSET_URN_DIGITS = 8 -MAVEDB_TMP_URN_DIGITS = 16 +#MAVEDB_TMP_URN_DIGITS = 16 MAVEDB_URN_MAX_LENGTH = 64 MAVEDB_URN_NAMESPACE = "mavedb" @@ -48,7 +48,7 @@ MAVEDB_EXPERIMENT_URN_PATTERN, MAVEDB_SCORESET_URN_PATTERN, MAVEDB_VARIANT_URN_PATTERN, - MAVEDB_TMP_URN_PATTERN, + #MAVEDB_TMP_URN_PATTERN, ) ] ) From 2551181fdc7da74cf8a3919f4c8c0fdf876fe5a2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:34:40 -0700 Subject: [PATCH 663/877] edit imports --- mavecore/validation/urn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/urn.py b/mavecore/validation/urn.py index bbda1b6..c753dbe 100644 --- a/mavecore/validation/urn.py +++ b/mavecore/validation/urn.py @@ -1,4 +1,4 @@ -import re +import uuid from mavecore.validation.exceptions import ValidationError from mavecore.validation.constants.urn import * From 8d553e07f4702bfd70fc808ab73af88f3489ff4d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:35:33 -0700 Subject: [PATCH 664/877] update URN validation for tmp URN --- mavecore/validation/urn.py | 69 +++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git 
a/mavecore/validation/urn.py b/mavecore/validation/urn.py index c753dbe..585fd80 100644 --- a/mavecore/validation/urn.py +++ b/mavecore/validation/urn.py @@ -18,8 +18,16 @@ def validate_mavedb_urn(urn): ValidationError If the MaveDB urn is not valid. """ - if not MAVEDB_ANY_URN_RE.match(urn): - raise ValidationError("{}'s is not a valid urn.".format(urn)) + if urn.startswith("tmp:"): + try: + uuid.UUID(urn[4:]) + except ValueError: + raise ValidationError( + "{}'s is not a valid Experiment Set urn.".format(urn) + ) + else: + if not MAVEDB_ANY_URN_RE.match(urn): + raise ValidationError("{}'s is not a valid urn.".format(urn)) def validate_mavedb_urn_experimentset(urn): @@ -36,11 +44,18 @@ def validate_mavedb_urn_experimentset(urn): ValidationError If the Experiment Set urn is not valid. """ - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - # "Error test" - "{}'s is not a valid Experiment Set urn.".format(urn) - ) + if urn.startswith("tmp:"): + try: + uuid.UUID(urn[4:]) + except ValueError: + raise ValidationError( + "{}'s is not a valid Experiment Set urn.".format(urn) + ) + else: + if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "{}'s is not a valid Experiment Set urn.".format(urn) + ) def validate_mavedb_urn_experiment(urn): @@ -57,10 +72,18 @@ def validate_mavedb_urn_experiment(urn): ValidationError If the Experiemnt urn is not valid. 
""" - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError( - "{}'s is not a valid Experiment urn.".format(urn) - ) + if urn.startswith("tmp:"): + try: + uuid.UUID(urn[4:]) + except ValueError: + raise ValidationError( + "{}'s is not a valid Experiment Set urn.".format(urn) + ) + else: + if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError( + "{}'s is not a valid Experiment urn.".format(urn) + ) def validate_mavedb_urn_scoreset(urn): @@ -77,8 +100,16 @@ def validate_mavedb_urn_scoreset(urn): ValidationError If the Scoreset urn is not valid. """ - if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError("{}'s is not a valid score set urn.".format(urn)) + if urn.startswith("tmp:"): + try: + uuid.UUID(urn[4:]) + except ValueError: + raise ValidationError( + "{}'s is not a valid Experiment Set urn.".format(urn) + ) + else: + if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError("{}'s is not a valid score set urn.".format(urn)) def validate_mavedb_urn_variant(urn): @@ -95,5 +126,13 @@ def validate_mavedb_urn_variant(urn): ValidationError If the MaveDB Variant urn is not valid. 
""" - if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): - raise ValidationError("{}'s is not a valid Variant urn.".format(urn)) + if urn.startswith("tmp:"): + try: + uuid.UUID(urn[4:]) + except ValueError: + raise ValidationError( + "{}'s is not a valid Experiment Set urn.".format(urn) + ) + else: + if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + raise ValidationError("{}'s is not a valid Variant urn.".format(urn)) From 9f5ec67f7f83ad113309dea12d8d7c5e7c5c3574 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:50:09 -0700 Subject: [PATCH 665/877] catch pydantic errors when validating --- mavecore/validation/validate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index 8b081ac..c41732d 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -24,9 +24,15 @@ def validate(dataset, dataset_type, scores=None, counts=None): If the dataset_type attribute is not a string that reads `experiments` or `scoresets`. 
""" if dataset_type == "experiments": - Experiment.parse_obj(dataset) + try: + Experiment.parse_obj(dataset) + except ValidationError as e: + print(e.json()) elif dataset_type == "scoresets": - ScoreSet.parse_obj(dataset) + try: + ScoreSet.parse_obj(dataset) + except ValidationError as e: + print(e.json()) validate_dataframes(scores=scores, counts=counts) else: raise ValueError("The dataset_type must be a string that reads `experiments` or `scoresets`.") From fe940659bd755dcc48c7407ab361dcfee56685cb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:50:20 -0700 Subject: [PATCH 666/877] edit imports --- mavecore/validation/validate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index c41732d..88cdf33 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -1,3 +1,4 @@ +from exceptions import ValidationError from mavecore.models.data import Experiment, ScoreSet from mavecore.validation.dataframe import validate_dataframes From 04cb68105ac6eaa3e0f01a071afcffcb11d5046d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:51:11 -0700 Subject: [PATCH 667/877] edit format --- mavecore/validation/variant.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/variant.py b/mavecore/validation/variant.py index c01c3c0..adf785d 100644 --- a/mavecore/validation/variant.py +++ b/mavecore/validation/variant.py @@ -14,6 +14,7 @@ from mavecore.validation.utilities import is_null + def validate_hgvs_string( value: Union[str, bytes], column: Optional[str] = None, From 4fa55d8967f879c5e49399f2d408c3e1af46048f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 16:33:11 -0700 Subject: [PATCH 668/877] update pydantic urn validation --- mavecore/models/data.py | 13 ++++++++++--- 1 file changed, 10 
insertions(+), 3 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index e193859..3228598 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -35,6 +35,13 @@ class ScoreSet(DataSet): pubmedIdentifiers: Optional[List[PubmedIdentifier]] targetGene: TargetGene - @validator('experimentUrn', 'supersededScoresetUrn') - def validate_matches_regular_expression(cls, v): - urn.validate_mavedb_urn_scoreset(v) + @validator('supersededScoresetUrn', 'metaAnalysisSourceScoresetUrns') + def validate_scoreset_urn(cls, v): + if type(v) == str: + urn.validate_mavedb_urn_scoreset(v) + else: + [urn.validate_mavedb_urn_scoreset(s) for s in v] + + @validator('experimentUrn') + def validate_experiment_urn(cls, v): + urn.validate_mavedb_urn_experiment(v) From 77d793a501cbbc1d7feab2a24f30c35e0a4a1078 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 16:33:52 -0700 Subject: [PATCH 669/877] edit setup method for TestScoreSet test model --- tests/models/data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/data.py b/tests/models/data.py index 42007d9..c581396 100644 --- a/tests/models/data.py +++ b/tests/models/data.py @@ -60,11 +60,11 @@ def setUp(self): pubmed_identifier = {"identifier": "29785012"} reference_map = {"genomeId": 0, "targetId": 0} sequence = {"sequenceType": "DNA", "sequence": "ATCG"} + external_identifier_id = {"dbname": "str", "identifier": "str"} + external_identifier = {"identifier": external_identifier_id, "offset": 0} target = {"name": "name", "category": "Protein coding", - "ensembleIdId": 0, - "refseqIdId": 0, - "uniprotIdId": 0, + "externalIdentifiers": [external_identifier], "referenceMaps": [reference_map], "wtSequence": sequence} self.scoreset = { @@ -73,12 +73,12 @@ def setUp(self): "abstractText": "abstract", "methodText": "methods", "extraMetadata": {}, - # "urn": "urn", "dataUsagePolicy": "policy", "licenceId": 0, - 
"replacesId": 0, "keywords": ["string"], - "experimentUrn": "urn", + "experimentUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "supersededScoresetUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "metaAnalysisSourceScoresetUrns": ["tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5"], "doiIdentifiers": [doi_identifier], "pubmedIdentifiers": [pubmed_identifier], "targetGene": target, From 1b6c60300f77ed32ef280d151bc0fbb2ea34bce2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 16:34:19 -0700 Subject: [PATCH 670/877] delete optional attributes in test method --- tests/models/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/data.py b/tests/models/data.py index c581396..d3aee0e 100644 --- a/tests/models/data.py +++ b/tests/models/data.py @@ -90,8 +90,9 @@ def test_valid_all_fields(self): def test_valid_exclude_optional(self): self.scoreset.pop("extraMetadata") self.scoreset.pop("keywords") - self.scoreset.pop("replacesId") self.scoreset.pop("doiIdentifiers") self.scoreset.pop("pubmedIdentifiers") + self.scoreset.pop("supersededScoresetUrn") + self.scoreset.pop("metaAnalysisSourceScoresetUrns") ScoreSet.parse_obj(self.scoreset) From c02fc9de7dda886d39579ccd341150314fd1a69d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 16:35:19 -0700 Subject: [PATCH 671/877] update urn validation functions --- mavecore/validation/urn.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/urn.py b/mavecore/validation/urn.py index 585fd80..d8f3d96 100644 --- a/mavecore/validation/urn.py +++ b/mavecore/validation/urn.py @@ -4,7 +4,6 @@ def validate_mavedb_urn(urn): - # TODO, currently not functioning in MaveDB """ This function validates a MaveDB urn and raises an error if it is not valid. 
@@ -52,7 +51,7 @@ def validate_mavedb_urn_experimentset(urn): "{}'s is not a valid Experiment Set urn.".format(urn) ) else: - if not (MAVEDB_EXPERIMENTSET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + if not MAVEDB_EXPERIMENTSET_URN_RE.match(urn): raise ValidationError( "{}'s is not a valid Experiment Set urn.".format(urn) ) @@ -80,7 +79,7 @@ def validate_mavedb_urn_experiment(urn): "{}'s is not a valid Experiment Set urn.".format(urn) ) else: - if not (MAVEDB_EXPERIMENT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + if not MAVEDB_EXPERIMENT_URN_RE.match(urn): raise ValidationError( "{}'s is not a valid Experiment urn.".format(urn) ) @@ -108,7 +107,7 @@ def validate_mavedb_urn_scoreset(urn): "{}'s is not a valid Experiment Set urn.".format(urn) ) else: - if not (MAVEDB_SCORESET_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + if not MAVEDB_SCORESET_URN_RE.match(urn): raise ValidationError("{}'s is not a valid score set urn.".format(urn)) @@ -134,5 +133,5 @@ def validate_mavedb_urn_variant(urn): "{}'s is not a valid Experiment Set urn.".format(urn) ) else: - if not (MAVEDB_VARIANT_URN_RE.match(urn) or MAVEDB_TMP_URN_RE.match(urn)): + if not MAVEDB_VARIANT_URN_RE.match(urn): raise ValidationError("{}'s is not a valid Variant urn.".format(urn)) From 8139f7318c42c90cf03ee3e887348012561e7eb1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 17:01:44 -0700 Subject: [PATCH 672/877] edit validate_dataframes function signature --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 4e70bd5..5c9ee8e 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -6,7 +6,7 @@ from mavecore.validation.variant import validate_hgvs_string -def validate_dataframes(scores=None, counts=None): +def validate_dataframes(target_seq, scores, counts=None): """ Validates 
scores and counts dataframes for MaveDB upload. This function performs comprehensive validation. From 4d967f2f5b92000ca96b2fa1a55d9d3571e9b054 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 17:02:50 -0700 Subject: [PATCH 673/877] update validate_values_by_column implementation --- mavecore/validation/dataframe.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 5c9ee8e..0412e83 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -95,7 +95,7 @@ def validate_column_names(columns): # also make sure counts df has a counts column -def validate_values_by_column(dataset): +def validate_values_by_column(dataset, target_seq): """ Validates a string of variants and verifies that the variant type in the column name makes sense with regards to the actual variants. @@ -112,13 +112,15 @@ def validate_values_by_column(dataset): ValidationError If any variant in the list of variants does not adhere to the mavehgvs specifications. 
""" + # make sure target seq is the right type + # no protein target with just nt variants for column in dataset.columns: if column == hgvs_nt_column: - dataset[[hgvs_nt_column]].apply(validate_hgvs_string(column="nt")) + dataset[[hgvs_nt_column]].apply(validate_hgvs_string(column="nt", targetseq=target_seq)) elif column == hgvs_pro_column: - dataset[[hgvs_pro_column]].apply(validate_hgvs_string(column="p")) + dataset[[hgvs_pro_column]].apply(validate_hgvs_string(column="p", targetseq=target_seq)) elif column == hgvs_splice_column: - dataset[[hgvs_splice_column]].apply(validate_hgvs_string(column="splice")) + dataset[[hgvs_splice_column]].apply(validate_hgvs_string(column="splice", targetseq=target_seq)) elif column == required_score_column: dataset[[required_score_column]].apply(validate_score()) else: From 246ceb3dc4583f2f7fcb63fe88308f380e67d3c2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 17:03:08 -0700 Subject: [PATCH 674/877] update calls to validate_values_by_column to reflect changes in function signature --- mavecore/validation/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 0412e83..fdf9b24 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -25,11 +25,11 @@ def validate_dataframes(target_seq, scores, counts=None): """ validate_no_null_columns_or_rows(scores) validate_column_names(scores.columns) - validate_values_by_column(scores) + validate_values_by_column(scores, target_seq) if counts is not None: validate_no_null_columns_or_rows(counts) validate_column_names(counts.columns) - validate_values_by_column(counts) + validate_values_by_column(counts, target_seq) validate_dataframes_define_same_variants(scores, counts) From 17c3135e0e7c2bd468381186764e5a95cb223bff Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: 
Mon, 19 Sep 2022 17:03:29 -0700 Subject: [PATCH 675/877] delete TODO --- tests/validation/urn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/validation/urn.py b/tests/validation/urn.py index 0bb122c..d732762 100644 --- a/tests/validation/urn.py +++ b/tests/validation/urn.py @@ -42,7 +42,6 @@ def test_invalid_mavedb_urn_variant(self): class TestValidateTmpUrn(TestCase): - # TODO consider the way we are making the tmp urn strings def test_valid_tmp_mavedb_urn(self): validate_mavedb_urn("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") From 7a7d0810aa668c3e83f224b3d2ced83d74e8056e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 17:10:16 -0700 Subject: [PATCH 676/877] update call to validate_dataframes to reflect changes in function signature --- mavecore/validation/validate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index 88cdf33..e699889 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -34,6 +34,7 @@ def validate(dataset, dataset_type, scores=None, counts=None): ScoreSet.parse_obj(dataset) except ValidationError as e: print(e.json()) - validate_dataframes(scores=scores, counts=counts) + target_seq = dataset["targetGene"]["wtSequence"]["sequence"] + validate_dataframes(target_seq=target_seq, scores=scores, counts=counts) else: raise ValueError("The dataset_type must be a string that reads `experiments` or `scoresets`.") From 7aee2b02f067c4f293988e734145de6c232f9272 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 19 Sep 2022 17:11:06 -0700 Subject: [PATCH 677/877] add notes --- mavecore/validation/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index fdf9b24..dbf978e 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py 
@@ -128,8 +128,10 @@ def validate_values_by_column(dataset, target_seq): def validate_score(score): + # should be a float pass +# is the variant an actual variant with regards to the wt sequence def validate_hgvs_columns_define_same_variants(nt=None, pro=None): """ From cfe34c8b11536c5ad792bcd7956602f6ceb0a8a9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 20 Sep 2022 10:44:56 -0700 Subject: [PATCH 678/877] implement validate score --- mavecore/validation/dataframe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index dbf978e..182fd2b 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -128,8 +128,11 @@ def validate_values_by_column(dataset, target_seq): def validate_score(score): - # should be a float - pass + if type(score) != float: + raise ValidationError( + "Each value in score column must by a float. " + "'{}' has the type '{}'.".format(score, type(score).__name__) + ) # is the variant an actual variant with regards to the wt sequence From 169c72fec1e18af3ef872d3453721172a69f1400 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 20 Sep 2022 10:56:16 -0700 Subject: [PATCH 679/877] update model unittests to reflect pydantic model changes --- tests/models/identifier.py | 23 ----------------------- tests/models/target.py | 16 +++++++--------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index 894cf68..f57228b 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -7,30 +7,16 @@ class TestIdentifier(TestCase): def setUp(self): self.identifier = { "identifier": "10.1038/s41588-018-0122-z", - "id": 0, - "url": "https://www.uw.edu", } def test_valid_all_fields(self): Identifier.parse_obj(self.identifier) - def test_valid_exclude_optional(self): - 
self.identifier.pop("id") - self.identifier.pop("url") - Identifier.parse_obj(self.identifier) - - def test_invalid_url(self): - self.identifier["url"] = "www.uw.edu" - with self.assertRaises(ValidationError): - Identifier.parse_obj(self.identifier) - class TestDoiIdentifier(TestCase): def setUp(self): self.doi_identifier = { "identifier": "10.1038/s41588-018-0122-z", - "id": 0, - "url": "https://www.uw.edu", } def test_valid_all_fields(self): @@ -46,20 +32,11 @@ class TestPubmedIdentifier(TestCase): def setUp(self): self.pubmed_identifier = { "identifier": "29785012", - "id": 0, - "url": "https://www.uw.edu", - "referenceHtml": "referencehtml", } def test_valid_all_fields(self): PubmedIdentifier.parse_obj(self.pubmed_identifier) - def test_valid_exclude_optional(self): - self.pubmed_identifier.pop("id") - self.pubmed_identifier.pop("url") - self.pubmed_identifier.pop("referenceHtml") - PubmedIdentifier.parse_obj(self.pubmed_identifier) - def test_invalid_type_of_identifier(self): self.pubmed_identifier["identifier"] = "10.1038/s41588-018-0122-z" with self.assertRaises(ValidationError): diff --git a/tests/models/target.py b/tests/models/target.py index 102ed99..2adf670 100644 --- a/tests/models/target.py +++ b/tests/models/target.py @@ -7,15 +7,13 @@ class TestTargetGene(TestCase): def setUp(self): reference_map = {"genomeId": 0, "targetId": 0} sequence = {"sequenceType": "Protein", "sequence": "ATCG"} - self.target = { - "name": "name", - "category": "Protein coding", - "ensembleIdId": 0, - "refseqIdId": 0, - "uniprotIdId": 0, - "referenceMaps": [reference_map], - "wtSequence": sequence, - } + external_identifier_id = {"dbname": "str", "identifier": "str"} + external_identifier = {"identifier": external_identifier_id, "offset": 0} + self.target = {"name": "name", + "category": "Protein coding", + "externalIdentifiers": [external_identifier], + "referenceMaps": [reference_map], + "wtSequence": sequence} def test_valid_all_fields(self): 
TargetGene.parse_obj(self.target) From c99b98a7830f6ebcbf41ebd190b65ecb0383bc76 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:13:43 -0700 Subject: [PATCH 680/877] reformat file --- tests/models/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/data.py b/tests/models/data.py index d3aee0e..174d26d 100644 --- a/tests/models/data.py +++ b/tests/models/data.py @@ -95,4 +95,3 @@ def test_valid_exclude_optional(self): self.scoreset.pop("supersededScoresetUrn") self.scoreset.pop("metaAnalysisSourceScoresetUrns") ScoreSet.parse_obj(self.scoreset) - From def399e051867330050d96b759754d0fa362e57e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:14:19 -0700 Subject: [PATCH 681/877] edit imports --- mavecore/models/identifier.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index ff3a8be..7ea1149 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -1,8 +1,7 @@ -from pydantic import BaseModel, ValidationError, validator, HttpUrl +from pydantic import BaseModel, validator, root_validator from typing import Optional -from mavecore.validation import identifier - +from mavecore.validation import identifier as id class Identifier(BaseModel): identifier: str From 541d1a2457bcf300edde448dac1f03d6cbc8284a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:14:41 -0700 Subject: [PATCH 682/877] edit function calling --- mavecore/models/identifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 7ea1149..d88c0fb 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -11,14 +11,14 @@ class DoiIdentifier(Identifier): @validator('identifier') def 
must_be_valid_doi(cls, v): - identifier.validate_doi_identifier(v) + id.validate_doi_identifier(v) class PubmedIdentifier(Identifier): @validator('identifier') def must_be_valid_pubmed(cls, v): - identifier.validate_pubmed_identifier(v) + id.validate_pubmed_identifier(v) class ExternalIdentifierId(BaseModel): From 87396b26f76c87808e7f9621833212580449f9dc Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:15:02 -0700 Subject: [PATCH 683/877] comment out ExternalIdentifierId --- mavecore/models/identifier.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index d88c0fb..e154123 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -21,10 +21,31 @@ def must_be_valid_pubmed(cls, v): id.validate_pubmed_identifier(v) -class ExternalIdentifierId(BaseModel): +'''class ExternalIdentifierId(BaseModel): dbname: str identifier: str + @root_validator(pre=True) + def check_passwords_match(cls, values): + print(values.get("dbname")) + # TODO resolve errors when using root_validator + #TODO confirm what valid dbname(s) are + #dbname, dbid = values.get('dbname'), values.get('identifier') + #print(dbname) + #print(dbid) + #if dbname == "sra": + # identifier.validate_sra_identifier(dbid) + #elif dbname == "ensembl": + # identifier.validate_ensembl_identifier(dbid) + #elif dbname == "uniprot": + # identifier.validate_uniprot_identifier(dbid) + #elif dbname == "refseq": + # identifier.validate_refseq_identifier(dbid) + #elif dbname == "genome": + # identifier.validate_genome_identifier(dbid) + #else: + # raise ValidationError("dbname must be valid dbname from this list: ")''' + class ExternalIdentifier(BaseModel): identifier: ExternalIdentifierId From 66f0715b78aaab89da9c90ef9fdb717a0a6738eb Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 
13:15:37 -0700 Subject: [PATCH 684/877] reimplement ExternalIdentifier --- mavecore/models/identifier.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index e154123..eef156f 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -48,5 +48,12 @@ def check_passwords_match(cls, values): class ExternalIdentifier(BaseModel): - identifier: ExternalIdentifierId - offset: int + identifier: dict + offset: Optional[int] + + # TODO validate the offset in relation to the ExternalIdentifier + @validator('identifier') + def validate_identifier(cls, v): + id.validate_external_identifier(v) + + From 8537f58ef1525250c1520a40666522853968e510 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:15:50 -0700 Subject: [PATCH 685/877] add valid list of dbnames --- mavecore/validation/constants/identifier.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 mavecore/validation/constants/identifier.py diff --git a/mavecore/validation/constants/identifier.py b/mavecore/validation/constants/identifier.py new file mode 100644 index 0000000..b55c8ca --- /dev/null +++ b/mavecore/validation/constants/identifier.py @@ -0,0 +1 @@ +valid_dbnames = ["uniprot"] \ No newline at end of file From fc003878548b513bd73b0d7e6be82b747baaf5c0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:16:13 -0700 Subject: [PATCH 686/877] add function for validating external identifiers --- mavecore/validation/identifier.py | 43 +++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/mavecore/validation/identifier.py b/mavecore/validation/identifier.py index 0b7aa84..49b2412 100644 --- a/mavecore/validation/identifier.py +++ b/mavecore/validation/identifier.py @@ -2,6 +2,49 @@ from mavecore.validation.exceptions import ValidationError from 
mavecore.validation.utilities import is_null +from mavecore.validation.constants.identifier import valid_dbnames + + +def validate_external_identifier(identifier): + """ + Validates an external identifier represented as a dictionary. The dictionary should have a length of 2 + and have the keys `dbname` and `identifier`, both with str values. The valid values for these keys are + stored in lists within the identifier file in constants directory. + + Parameters + __________ + identifier: dict + The identifier to be validated. + + Raises + ______ + ValidationError + If the length of the dictionary is not 2. + ValidationError + If the keys do not have the correct name. + ValidationError + If the `dbname` value is not valid. + ValidationError + If the `identifier` value is not correct as it relates to the `dbname` value. + """ + if len(identifier) != 2: + raise ValidationError("The identifier attribute of the external identifier should have two keys, `dbname` " + "and `identifier`.") + # check that the keys are the right name + elif "dbname" not in identifier: + raise ValidationError("The identifier attribute of the external identifier should have two Keys, `dbname` " + "and `identifier`.") + elif "identifier" not in identifier: + raise ValidationError("The identifier attribute of the external identifier should have two Keys, `dbname` " + "and `identifier`.") + # check that dbname is valid + elif identifier.get("dbname") not in valid_dbnames: + raise ValidationError(f"The `dbname` key within the identifier attribute of the external identifier should " + f"take one of the following values: {valid_dbnames}.") + # validate identifier based on dbname + elif identifier.get("dbname") == "uniprot": + validate_uniprot_identifier(identifier.get("identifier")) + # TODO add other conditions like the one above def validate_sra_identifier(identifier): From 9d0eeed3ece77edd151685fa32230592de0ac9fe Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> 
Date: Fri, 23 Sep 2022 13:16:25 -0700 Subject: [PATCH 687/877] edit imports --- tests/models/identifier.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index f57228b..faf6948 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -1,6 +1,11 @@ from unittest import TestCase from pydantic import ValidationError -from mavecore.models.identifier import Identifier, DoiIdentifier, PubmedIdentifier +from mavecore.models.identifier import (Identifier, + DoiIdentifier, + PubmedIdentifier, + #ExternalIdentifierId, + ExternalIdentifier) + class TestIdentifier(TestCase): From 6e24bf0ac425b8c64ecf1f9923a79575543d566c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:16:47 -0700 Subject: [PATCH 688/877] test ExternalIdentifier --- tests/models/identifier.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index faf6948..7bf787e 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -46,3 +46,16 @@ def test_invalid_type_of_identifier(self): self.pubmed_identifier["identifier"] = "10.1038/s41588-018-0122-z" with self.assertRaises(ValidationError): PubmedIdentifier.parse_obj(self.pubmed_identifier) + + +class TestExternalIdentifier(TestCase): + def setUp(self): + self.external_identifier_id = {"dbname": "uniprot", "identifier": "P01133"} + self.external_identifier = {"identifier": self.external_identifier_id, "offset": 0} + + """def test_valid_external_identifier_id(self): + print(hasattr(self.external_identifier_id, "dbname")) + ExternalIdentifierId.parse_obj(self.external_identifier_id)""" + + def test_valid_external_identifier(self): + ExternalIdentifier.parse_obj(self.external_identifier) \ No newline at end of file From 919e1b3ee7a8546691c4a42477a5a6a481f96232 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:17:09 -0700 Subject: [PATCH 689/877] edit attribute fields to pass validation --- tests/models/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/target.py b/tests/models/target.py index 2adf670..e6deb5f 100644 --- a/tests/models/target.py +++ b/tests/models/target.py @@ -7,7 +7,7 @@ class TestTargetGene(TestCase): def setUp(self): reference_map = {"genomeId": 0, "targetId": 0} sequence = {"sequenceType": "Protein", "sequence": "ATCG"} - external_identifier_id = {"dbname": "str", "identifier": "str"} + external_identifier_id = {"dbname": "uniprot", "identifier": "P01133"} external_identifier = {"identifier": external_identifier_id, "offset": 0} self.target = {"name": "name", "category": "Protein coding", From b3db7c3f7d1be0015599c58c065d224df4c68ad6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 13:17:28 -0700 Subject: [PATCH 690/877] edit attribute fields to pass validation --- tests/validation/urn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/validation/urn.py b/tests/validation/urn.py index d732762..4723144 100644 --- a/tests/validation/urn.py +++ b/tests/validation/urn.py @@ -50,8 +50,7 @@ def test_invalid_tmp_mavedb_urn(self): validate_mavedb_urn("urn:mavedb:00000002-a-1-z") def test_valid_tmp_mavedb_urn_experimentset(self): - validate_mavedb_urn_experimentset("") - pass + validate_mavedb_urn_experimentset("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") def test_invalid_tmp_mavedb_urn_experimentset(self): with self.assertRaises(ValidationError): @@ -65,14 +64,14 @@ def test_invalid_tmp_mavedb_urn_experiment(self): validate_mavedb_urn_experiment("") def test_valid_tmp_mavedb_urn_scoreset(self): - validate_mavedb_urn_scoreset("tmp:a56b8eb08e190490") + validate_mavedb_urn_scoreset("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") def 
test_invalid_tmp_mavedb_urn_scoreset(self): with self.assertRaises(ValidationError): validate_mavedb_urn_scoreset("") def test_valid_tmp_mavedb_urn_variant(self): - validate_mavedb_urn_variant("") + validate_mavedb_urn_variant("tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5") def test_invalid_tmp_mavedb_urn_variant(self): with self.assertRaises(ValidationError): From 924c2e71b24807e1d6abb395cec23cc65e0d0e8e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 14:23:22 -0700 Subject: [PATCH 691/877] update imports --- tests/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 2fce730..938f200 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -2,6 +2,7 @@ import pandas as pd from mavecore.validation.dataframe import * +from mavecore.validation.constants.general import null_values_list """ from io import BytesIO, StringIO From 5d9fea024baf34424a6b7ef2aac2bcf1bbab7c46 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 14:24:21 -0700 Subject: [PATCH 692/877] add test cases for validating column names --- tests/validation/dataframe.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 938f200..9d1af15 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -66,6 +66,14 @@ def setUp(self): def test_valid_column_names(self): validate_column_names(self.dataframe.columns) + def test_valid_just_hgvs_nt_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) + validate_column_names(self.dataframe.columns) + + def test_valid_just_hgvs_pro_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) + 
validate_column_names(self.dataframe.columns) + def test_missing_hgvs_column(self): self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column], axis=1) with self.assertRaises(ValidationError): @@ -81,11 +89,17 @@ def test_no_additional_columns_beyond_hgvs(self): with self.assertRaises(ValidationError): validate_column_names(self.dataframe.columns) - def test_null_column_name(self): - self.dataframe.rename(columns={hgvs_splice_column: 'null'}, inplace=True) - with self.assertRaises(ValidationError): + def test_hgvs_columns_must_be_lowercase(self): + self.dataframe.rename(columns={hgvs_nt_column: hgvs_nt_column.upper()}, inplace=True) + with self.assertRaises(ValueError): validate_column_names(self.dataframe.columns) + def test_null_column_name(self): + for value in null_values_list: + self.dataframe.rename(columns={hgvs_splice_column: value}, inplace=True) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe.columns) + class TestValidateVariants(TestCase): def setUp(self): From d9f1c84312457a7ac70c6dab75d0d06d3179c6d0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 14:24:34 -0700 Subject: [PATCH 693/877] delete redundant test cases --- tests/validation/dataframe.py | 74 ----------------------------------- 1 file changed, 74 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 9d1af15..9fc098b 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -166,80 +166,6 @@ def test_counts_defines_different_pro_variants(self): with self.assertRaises(ValidationError): validate_dataframes_define_same_variants(self.scores, self.counts) - -class TestNoNullInColumnsValidator(TestCase): - """ - Tests to ensure that an input file contains no null values in the header - such as '', None, null etc. 
- """ - - def test_raises_valuerror_when_null_values_in_column(self): - '''for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_header_contains_no_null_columns(header)''' - - def test_does_not_raise_valuerror_when_non_null_values_in_column( - self, - ): - '''file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_header_contains_no_null_columns(header) # Should pass''' - - -class TestAtLeastOneNumericColumnValidator(TestCase): - """ - Tests to ensure that an input file contains at least two columns. - """ - - def test_raises_valuerror_when_less_than_2_values_in_column(self): - '''file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_at_least_one_additional_column(header)''' - - def test_does_not_raise_valuerror_2_or_more_values_in_column(self): - '''file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) # Should pass - - file = BytesIO("{},score\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_at_least_one_additional_column(header) # Should pass''' - - -class TestHgvsInHeaderValidator(TestCase): - """ - Tests that case-sensitive 'hgvs' is in the header of a file. 
- """ - - def test_raises_valuerror_when_neither_hgvs_col_in_column(self): - '''file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_has_hgvs_in_header(header)''' - - def test_hgvs_must_be_lowercase(self): - '''file = BytesIO( - "{},score,count\n".format(constants.hgvs_nt_column.upper()).encode() - ) - with self.assertRaises(ValueError): - header = read_header_from_io(file) - validate_has_hgvs_in_header(header)''' - - def test_does_not_raise_valuerror_when_either_hgvs_in_column(self): - '''file = BytesIO("{},score,count\n".format(constants.hgvs_nt_column).encode()) - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) # Should pass - - file = BytesIO("{},score,count\n".format(constants.hgvs_pro_column).encode()) - header = read_header_from_io(file) - validate_has_hgvs_in_header(header) # Should pass''' - - class TestValidateScoreCountsDefineSameVariants(TestCase): """ Tests that an uploaded score/counts files define the same variants From 5f5985e4cab18b9281b12c6ee4501c1e0e245820 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 14:58:31 -0700 Subject: [PATCH 694/877] add dataset validation test cases --- tests/validation/dataframe.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 9fc098b..91101f9 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -63,9 +63,14 @@ def setUp(self): } ) - def test_valid_column_names(self): + def test_valid_scores_column_names(self): validate_column_names(self.dataframe.columns) + def test_valid_counts_column_names(self): + self.dataframe = self.dataframe.drop([required_score_column], axis=1) + self.dataframe["count"] = [5] + validate_column_names(self.dataframe.columns, scores=False) + def test_valid_just_hgvs_nt_hgvs_column(self): 
self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) validate_column_names(self.dataframe.columns) @@ -84,11 +89,16 @@ def test_hgvs_in_wrong_location(self): with self.assertRaises(ValidationError): validate_column_names(self.dataframe.columns) - def test_no_additional_columns_beyond_hgvs(self): + def test_no_additional_columns_beyond_hgvs_scores_df(self): self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) with self.assertRaises(ValidationError): validate_column_names(self.dataframe.columns) + def test_no_additional_columns_beyond_hgvs_counts_df(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe.columns, scores=False) + def test_hgvs_columns_must_be_lowercase(self): self.dataframe.rename(columns={hgvs_nt_column: hgvs_nt_column.upper()}, inplace=True) with self.assertRaises(ValueError): @@ -100,6 +110,17 @@ def test_null_column_name(self): with self.assertRaises(ValidationError): validate_column_names(self.dataframe.columns) + def test_no_score_column_with_scores_df(self): + self.dataframe = self.dataframe.drop([required_score_column], axis=1) + self.dataframe["count"] = [1] + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe.columns) + + def test_no_additional_column_with_counts_df(self): + self.dataframe = self.dataframe.drop([required_score_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe.columns, scores=False) + class TestValidateVariants(TestCase): def setUp(self): From 886e0add45c16e9640d3594e2710b2862b32f4b9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 23 Sep 2022 14:58:43 -0700 Subject: [PATCH 695/877] delete redundant test cases --- tests/validation/dataframe.py | 180 
---------------------------------- 1 file changed, 180 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 91101f9..df1a286 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -187,186 +187,6 @@ def test_counts_defines_different_pro_variants(self): with self.assertRaises(ValidationError): validate_dataframes_define_same_variants(self.scores, self.counts) -class TestValidateScoreCountsDefineSameVariants(TestCase): - """ - Tests that an uploaded score/counts files define the same variants - in both the _nt column and _pro column. - """ - - def test_ve_counts_defines_different_nt_variants(self): - '''scores = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: [None], - constants.hgvs_splice_column: [None], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.2A>G"], - constants.hgvs_pro_column: [None], - constants.hgvs_splice_column: [None], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts)''' - - def test_ve_counts_defines_different_splice_variants(self): - '''scores = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: ["c.1A>G"], - constants.hgvs_pro_column: [None], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: ["c.2A>G"], - constants.hgvs_pro_column: [None], - } - ) - with self.assertRaises(ValueError): - validate_datasets_define_same_variants(scores, counts)''' - - def test_ve_counts_defines_different_pro_variants(self): - '''scores = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: [None], - constants.hgvs_pro_column: ["p.Leu5Glu"], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: [None], - constants.hgvs_splice_column: [None], - constants.hgvs_pro_column: ["p.Leu75Glu"], - } - ) - with self.assertRaises(ValueError): - 
validate_datasets_define_same_variants(scores, counts)''' - - def test_passes_when_same_variants_defined(self): - '''scores = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: ["p.Leu5Glu"], - constants.hgvs_splice_column: ["c.1A>G"], - } - ) - counts = pd.DataFrame( - { - constants.hgvs_nt_column: ["c.1A>G"], - constants.hgvs_pro_column: ["p.Leu5Glu"], - constants.hgvs_splice_column: ["c.1A>G"], - } - ) - validate_datasets_define_same_variants(scores, counts)''' - - -class TestValidateScoreSetCountDataInputValidator(TestCase): - """ - Tests that validation errors are thrown when an ill-formatted count data - input file is supplied. - """ - - def test_raises_valuerror_when_hgvs_not_in_column(self): - '''file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file)''' - - def test_raises_valuerror_no_numeric_column(self): - '''file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file)''' - - def test_raises_valuerror_when_null_values_in_column(self): - '''for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - validate_scoreset_count_data_input(file)''' - - -class TestValidateScoreSetScoreDataInputValidator(TestCase): - """ - Tests that validation errors are thrown when an ill-formatted score data - input file is supplied. 
- """ - - def test_raises_valuerror_when_hgvs_not_in_column(self): - '''file = BytesIO("score,count\n".encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file)''' - - def test_raises_valuerror_no_numeric_column(self): - '''file = BytesIO("{}\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file)''' - - def test_raises_valuerror_when_null_values_in_column(self): - '''for value in constants.null_values_list: - file = BytesIO( - "{},score,{}\n".format(constants.hgvs_nt_column, value).encode() - ) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file)''' - - def test_validatation_error_score_not_in_header(self): - '''file = BytesIO("{},count\n".format(constants.hgvs_nt_column).encode()) - with self.assertRaises(ValueError): - validate_scoreset_score_data_input(file)''' - - -class TestValidateScoreSetJsonValidator(TestCase): - """ - Test to ensure that a scoreset json field is properly formatted. 
- """ - - def test_valueerror_unexptected_columns(self): - '''field = { - "extra_column": [], - constants.score_columns: ["score"], - constants.count_columns: [], - } - with self.assertRaises(ValueError): - validate_scoreset_json(field)''' - - def test_valueerror_values_not_lists(self): - '''field = {constants.score_columns: ["score"], constants.count_columns: {}} - with self.assertRaises(ValueError): - validate_scoreset_json(field)''' - - def test_valueerror_list_values_not_strings(self): - '''field = {constants.score_columns: [b"score"], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field)''' - - def test_valueerror_empty_score_columns(self): - '''field = {constants.score_columns: [], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field)''' - - def test_valueerror_missing_dict_columns(self): - '''# constants.score_columns missing - field = {constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field) - - # constants.count_columns missing - field = {constants.score_columns: ["score"]} - with self.assertRaises(ValueError): - validate_scoreset_json(field)''' - - def test_valueerror_missing_header_columns(self): - '''# constants.score_columns columns missing 'score' - field = {constants.score_columns: ["hgvs"], constants.count_columns: []} - with self.assertRaises(ValueError): - validate_scoreset_json(field)''' - - """ from io import StringIO import unittest From ad9c382b57eada8c04f027e15b9aca7730e8e019 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:19:20 -0700 Subject: [PATCH 696/877] add dictionaries for converting values to utilities --- mavecore/validation/constants/conversion.py | 42 +++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 mavecore/validation/constants/conversion.py diff --git a/mavecore/validation/constants/conversion.py 
b/mavecore/validation/constants/conversion.py new file mode 100644 index 0000000..37bc20f --- /dev/null +++ b/mavecore/validation/constants/conversion.py @@ -0,0 +1,42 @@ +aa_dict_key_1 = { + "A": "Ala", "C": "Cys", "D": "Asp", "E": "Glu", "F": "Phe", + "G": "Gly", "H": "His", "I": "Ile", "K": "Lys", "L": "Leu", + "M": "Met", "N": "Asn", "P": "Pro", "Q": "Gln", "R": "Arg", + "S": "Ser", "T": "Thr", "V": "Val", "W": "Trp", "Y": "Tyr", + "*": "Ter" +} # what is Z? "X":"Ter", "WTSYN":""} + +aa_dict_key_3 = { + "Ala": "A", "Cys": "C", "Asp": "D", "Glu": "E", "Phe": "F", + "Gly": "G", "His": "H", "Ile": "I", "Lys": "K", "Leu": "L", + "Met": "M", "Asn": "N", "Pro": "P", "Gln": "Q", "Arg": "R", + "Ser": "S", "Thr": "T", "Val": "V", "Trp": "W", "Tyr": "Y", + "Ter": "*" +} + +codon_dict_DNA = { + # T + 'TTT': 'Phe', 'TCT': 'Ser', 'TAT': 'Tyr', 'TGT': 'Cys', # TxT + 'TTC': 'Phe', 'TCC': 'Ser', 'TAC': 'Tyr', 'TGC': 'Cys', # TxC + 'TTA': 'Leu', 'TCA': 'Ser', 'TAA': 'Ter', 'TGA': 'Ter', # TxA + 'TTG': 'Leu', 'TCG': 'Ser', 'TAG': 'Ter', 'TGG': 'Trp', # TxG + + # C + 'CTT': 'Leu', 'CCT': 'Pro', 'CAT': 'His', 'CGT': 'Arg', # CxT + 'CTC': 'Leu', 'CCC': 'Pro', 'CAC': 'His', 'CGC': 'Arg', # CxC + 'CTA': 'Leu', 'CCA': 'Pro', 'CAA': 'Gln', 'CGA': 'Arg', # CxA + 'CTG': 'Leu', 'CCG': 'Pro', 'CAG': 'Gln', 'CGG': 'Arg', # CxG + + # A + 'ATT': 'Ile', 'ACT': 'Thr', 'AAT': 'Asn', 'AGT': 'Ser', # AxT + 'ATC': 'Ile', 'ACC': 'Thr', 'AAC': 'Asn', 'AGC': 'Ser', # AxC + 'ATA': 'Ile', 'ACA': 'Thr', 'AAA': 'Lys', 'AGA': 'Arg', # AxA + 'ATG': 'Met', 'ACG': 'Thr', 'AAG': 'Lys', 'AGG': 'Arg', # AxG + + # G + 'GTT': 'Val', 'GCT': 'Ala', 'GAT': 'Asp', 'GGT': 'Gly', # GxT + 'GTC': 'Val', 'GCC': 'Ala', 'GAC': 'Asp', 'GGC': 'Gly', # GxC + 'GTA': 'Val', 'GCA': 'Ala', 'GAA': 'Glu', 'GGA': 'Gly', # GxA + 'GTG': 'Val', 'GCG': 'Ala', 'GAG': 'Glu', 'GGG': 'Gly' # GxG +} + From 6b17e39157294667eaf100df8cc4968cf07697ca Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 
Sep 2022 18:20:05 -0700 Subject: [PATCH 697/877] edit attribute values to for unittest setUp --- tests/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/data.py b/tests/models/data.py index 174d26d..570b14d 100644 --- a/tests/models/data.py +++ b/tests/models/data.py @@ -60,7 +60,7 @@ def setUp(self): pubmed_identifier = {"identifier": "29785012"} reference_map = {"genomeId": 0, "targetId": 0} sequence = {"sequenceType": "DNA", "sequence": "ATCG"} - external_identifier_id = {"dbname": "str", "identifier": "str"} + external_identifier_id = {"dbname": "uniprot", "identifier": "P01133"} external_identifier = {"identifier": external_identifier_id, "offset": 0} target = {"name": "name", "category": "Protein coding", From 8f97ecbb366e416a37a954cbcbfd8d608d65ba74 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:20:15 -0700 Subject: [PATCH 698/877] edit imports --- mavecore/validation/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 182fd2b..1314613 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -4,6 +4,8 @@ from mavecore.validation.constants.general import * from mavecore.validation.exceptions import ValidationError from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.utilities import construct_hgvs_pro, get_codon_data_from_nt_variants +from mavecore.validation.constants.conversion import codon_dict_DNA def validate_dataframes(target_seq, scores, counts=None): From b3c1720fa83700416ef030554c6b2113d62aa766 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:20:33 -0700 Subject: [PATCH 699/877] edit parameters to reflect changes in function signature --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 1314613..1fcc840 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -30,7 +30,7 @@ def validate_dataframes(target_seq, scores, counts=None): validate_values_by_column(scores, target_seq) if counts is not None: validate_no_null_columns_or_rows(counts) - validate_column_names(counts.columns) + validate_column_names(counts.columns, scores=False) validate_values_by_column(counts, target_seq) validate_dataframes_define_same_variants(scores, counts) From 5cde3444651e44f2ba95fa1f29e58475209a8396 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:22:58 -0700 Subject: [PATCH 700/877] edit validation to check for presence or absence of score column and data from dataset --- mavecore/validation/dataframe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 1fcc840..69ea023 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -58,7 +58,7 @@ def validate_no_null_columns_or_rows(dataframe): raise ValidationError("Dataset should not contain null columns or rows.") -def validate_column_names(columns): +def validate_column_names(columns, scores=True): """ This function validates the columns in a dataframe. The first columns should be an hgvs column such as hgvs_nt, hgvs_pro, and hgvs_splice. 
There should be at least @@ -79,10 +79,12 @@ def validate_column_names(columns): """ # count instances of hgvs columns count = 0 + score_column = False for i in range(len(columns)): # there should not be any null columns if columns[i] in readable_null_values_list: raise ValidationError("Column names must not be null.") if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: count+=1 + if columns[i] == required_score_column: score_column = True # there should be at least one hgvs column if count == 0: raise ValidationError("Must include hgvs_nt, hgvs_pro, or hgvs_splice column.") # first columns should be hgvs columns @@ -95,6 +97,11 @@ def validate_column_names(columns): # validate against UTF-8byte ordering marks # TODO if dataframe is a scores df make sure it has a score column # also make sure counts df has a counts column + if scores and not score_column: + raise ValidationError("A scores dataframe must include a `score` column.") + if not scores and score_column: + raise ValidationError("A counts dataframe should not include a `score` column, include `score` " + "column in a scores dataframe.") def validate_values_by_column(dataset, target_seq): From f5ab4dfd56465014c4d7f1310e61f4e8e999638d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:23:36 -0700 Subject: [PATCH 701/877] reimplement validate_values_by_column to check for inconsistencies between columns --- mavecore/validation/dataframe.py | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 69ea023..0eb2264 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -121,6 +121,45 @@ def validate_values_by_column(dataset, target_seq): ValidationError If any variant in the list of variants does not adhere to the mavehgvs specifications. 
""" + # first check the column names, establish the order or the hgvs and score columns + hgvs_nt = False + hgvs_pro = False + hgvs_splice = False + score = False + for column in dataset.columns: + if column == hgvs_nt_column: + hgvs_nt = True + elif column == hgvs_pro_column: + hgvs_pro = True + elif column == hgvs_splice_column: + hgvs_splice = True + elif column == required_score_column: + score = True + else: + raise ValidationError("Missing required hgvs and/or score columns.") + + # loop through row by row, validate hgvs strings, make sure nt and pro are consistent with one another + for i in range(len(dataset)): + if hgvs_nt: + validate_hgvs_string(value=dataset.loc[i, hgvs_nt_column], + column="nt", + targetseq=target_seq) + if hgvs_pro: + validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], + column="p", + targetseq=target_seq) + if hgvs_splice: + validate_hgvs_string(value=dataset.loc[i, hgvs_splice_column], + column="splice", + targetseq=target_seq) + if score: + validate_score(dataset.loc[i, required_score_column]) + if hgvs_nt and hgvs_pro: + validate_hgvs_columns_define_same_variants(target_seq=target_seq, + nt=dataset.loc[i, hgvs_nt_column], + pro=dataset.loc[i, hgvs_pro_column], + row=i) + # make sure target seq is the right type # no protein target with just nt variants for column in dataset.columns: From d5eda1311f2c616064ad97e5f713c1e0a8c35512 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:24:00 -0700 Subject: [PATCH 702/877] delete obsolete code --- mavecore/validation/dataframe.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 0eb2264..4617e8d 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -162,17 +162,6 @@ def validate_values_by_column(dataset, target_seq): # make sure target seq is the right type # no protein target with just nt variants - for 
column in dataset.columns: - if column == hgvs_nt_column: - dataset[[hgvs_nt_column]].apply(validate_hgvs_string(column="nt", targetseq=target_seq)) - elif column == hgvs_pro_column: - dataset[[hgvs_pro_column]].apply(validate_hgvs_string(column="p", targetseq=target_seq)) - elif column == hgvs_splice_column: - dataset[[hgvs_splice_column]].apply(validate_hgvs_string(column="splice", targetseq=target_seq)) - elif column == required_score_column: - dataset[[required_score_column]].apply(validate_score()) - else: - pass def validate_score(score): From 553ba236a3bf48a5f84f5904d333d5a0a8914e00 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:24:08 -0700 Subject: [PATCH 703/877] remove comment --- mavecore/validation/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 4617e8d..83763c2 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -171,7 +171,6 @@ def validate_score(score): "'{}' has the type '{}'.".format(score, type(score).__name__) ) -# is the variant an actual variant with regards to the wt sequence def validate_hgvs_columns_define_same_variants(nt=None, pro=None): """ From 020692bf51ed5b203a41ff0a15cdad3b99f38bd7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:25:38 -0700 Subject: [PATCH 704/877] implement function to ensure hgvs nt and pro define same change --- mavecore/validation/dataframe.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 83763c2..2de9b9e 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -172,7 +172,7 @@ def validate_score(score): ) -def validate_hgvs_columns_define_same_variants(nt=None, pro=None): +def 
validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq, nt, pro, row): """ Checks that, when two or more of hgvs_nt, hgvs_pro, and hgvs_splice columns exist, the variant strings within those columns are representing the same change. @@ -189,7 +189,17 @@ def validate_hgvs_columns_define_same_variants(nt=None, pro=None): ValidationError If any of the variants within each column do not represent the same change. """ - pass + # first get data from nt variant + target_codon, codon_number, variant_codon = get_codon_data_from_nt_variants(hgvs_nt=nt, target_seq=target_seq) + # convert to 3 letter amino acid code + target_aa = codon_dict_DNA[target_codon] + variant_aa = codon_dict_DNA[variant_codon] + # then construct a protein variant from the data + nt_converted = construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number) + # compare nt_converted with pro + if nt_converted != pro: + raise ValidationError("The hgvs_nt variant {} and the hgvs_pro variant {} on row {} do not represent the " + "same change.".format(nt, pro, row)) def validate_dataframes_define_same_variants(scores, counts): From 69681a8ad59af756705d1150e4c197f4bf8e8e89 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:26:22 -0700 Subject: [PATCH 705/877] edit call to function to reflect change in function signature --- mavecore/validation/dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 2de9b9e..96bce17 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -155,10 +155,10 @@ def validate_values_by_column(dataset, target_seq): if score: validate_score(dataset.loc[i, required_score_column]) if hgvs_nt and hgvs_pro: - validate_hgvs_columns_define_same_variants(target_seq=target_seq, - nt=dataset.loc[i, hgvs_nt_column], - pro=dataset.loc[i, hgvs_pro_column], - row=i) + 
validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, + nt=dataset.loc[i, hgvs_nt_column], + pro=dataset.loc[i, hgvs_pro_column], + row=i) # make sure target seq is the right type # no protein target with just nt variants From 11c5ce10921a3caa34183bd71d99b1490343906d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:26:40 -0700 Subject: [PATCH 706/877] edit imports --- mavecore/validation/utilities.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index c5fce5a..763b485 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -1,4 +1,10 @@ from mavecore.validation.constants.general import null_values_re +from random import choice + +from mavehgvs.variant import Variant +from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.constants.conversion import aa_dict_key_3, codon_dict_DNA +#from mavetools.mavedf.mutation_type import * def is_null(value): From 5416d463c6c684b4c4b1373ff3a7b70f3e5cb6f9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 26 Sep 2022 18:27:30 -0700 Subject: [PATCH 707/877] add utility functions for parsing and reconstructing hgvs strings of various types --- mavecore/validation/utilities.py | 263 +++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 763b485..390084f 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -23,3 +23,266 @@ def is_null(value): """ value = str(value).strip().lower() return null_values_re.fullmatch(value) or not value + + +def generate_hgvs(prefix: str = "c") -> str: + """ + Generates a random hgvs string from a small sample. + """ + if prefix == "p": + # Subset of 3-letter codes, chosen at random. 
+ amino_acids = [ + "Ala", + "Leu", + "Gly", + "Val", + "Tyr", + "Met", + "Cys", + "His", + "Glu", + "Phe", + ] + ref = choice(amino_acids) + alt = choice(amino_acids) + return f"{prefix}.{ref}{choice(range(1, 100))}{alt}" + else: + alt = choice("ATCG") + ref = choice("ATCG") + return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}" + + +def construct_hgvs_pro(wt, mutant, position: int): + #if pd.isna(position): return None + if wt == mutant: + hgvs = "p." + wt + str(position) + "=" + else: + hgvs = "p." + wt + str(position) + mutant + # validate variant + validate_hgvs_string(hgvs) + return hgvs #, hgvs_validate + + +def get_codon_data_from_nt_variants(hgvs_nt, target_seq): + """ + This method takes in a target sequence and converts coding variants into codon changes. + These changes are stored in three additional MaveDf columns: target_codon, codon_number, and variant_codon. + This method also updates the hgvs from legacy to mave hgvs, if it is not already done. + + Parameters + __________ + target_seq : string + target sequence + + Raises + ______ + TypeError + if target_seq is not string + ValueError + if target_seq is not made solely of characters ACTG + """ + # check for TypeError + # if target_seq is not string + if not isinstance(target_seq, str): + raise TypeError("target_seq must be string") + + # check for ValueError + # if target_seq is not made solely of characters ACTG + check_chars = [letter in "ACTG" for letter in target_seq] + if False in check_chars: + raise ValueError("target_seq is invalid") + + # identify variant_position and get codon_number associated with it + + if is_wild_type(hgvs_nt): # variant_codon is wild-type + codon_number = None + target_codon = None + else: # any other variant change + # instantiate Variant object + variant = Variant(hgvs_nt) + # get variant position and convert to int + if type(variant.positions) == list: # multiple positions values exist + variant_position = int(str(variant.positions[0])) + elif 
type(variant.positions) == tuple: + variant_position = int(str(variant.positions[0])) + else: # only one value for positions + variant_position = int(str(variant.positions)) + # now that we have the variant_position, get codon_number + codon_number = round((variant_position / 3) + 0.5) + # use codon_number to get target_codon from target_seq + target_codon = target_seq[(codon_number - 1) * 3 : codon_number * 3] + + # determine sequence of variant_codon + + if is_wild_type(hgvs_nt): # variant_codon is wild-type + variant_codon = target_codon + sub_one = None # no nucleotide substitutions + elif is_deletion(hgvs_nt): # target_codon was deleted + variant_codon = None + sub_one = None # no nucleotide substitutions + elif is_substitution_one_base( + hgvs_nt + ): # variant_codon has one nucleotide substitution + # instantiate Variant object + variant = Variant(hgvs_nt) + # get index of nucleotide substitution + sub_one = int(str(variant.positions)) % 3 - 1 + # get nucleotide of substitution + sub_one_nuc = variant.sequence[1] + # set other possible indices for codon substitution to None + sub_two = None + sub_three = None + elif is_substitution_two_bases_nonadjacent( + hgvs_nt + ): # variant has two nucleotide substitutions, non-adjacent + # instantiate Variant object + variant = Variant(hgvs_nt) + # get indices of nucleotide substitutions + sub_one = int(str(variant.positions[0])) % 3 - 1 + sub_two = int(str(variant.positions[1])) % 3 - 1 + # get nucleotides of substitutions + sub_one_nuc = variant.sequence[0][1] + sub_two_nuc = variant.sequence[1][1] + # set other possible indices for codon substitution to None + sub_three = None + else: # variant_codon has two or three adjacent nucleotide substitutions + # instantiate Variant object + variant = Variant(hgvs_nt) + variant_codon = variant.sequence + # get index of first codon substitution + sub_one = int(str(variant.positions[0])) % 3 - 1 + # get string of substituted nucleotides + sub_nucs = variant.sequence + if ( + 
len(sub_nucs) == 2 + ): # variant codon has two adjacent nucleotide substitutions + # assign additional nucleotide substitution indices + sub_two = sub_one + 1 + # get nucleotides of substitutions + sub_one_nuc = sub_nucs[0] + sub_two_nuc = sub_nucs[1] + # set other possible indices for codon substitution to None + sub_three = None + else: # variant has three adjacent nucleotide substitutions + # assign additional nucleotide substitution indices + sub_two = sub_one + 1 + sub_three = sub_two + 1 + # get nucleotides of substitutions + sub_one_nuc = sub_nucs[0] + sub_two_nuc = sub_nucs[1] + sub_three_nuc = sub_nucs[2] + + # using data generated above (substituted nucleotides and indices in codon), construct variant_codon + + # only assign variant_codon if nucleotide substitution occurred + if sub_one is not None: + # declare and initialize variant_codon + variant_codon = "" + # set first nucleotide of variant_codon + if sub_one == 0: + variant_codon = variant_codon + sub_one_nuc + else: + variant_codon = variant_codon + target_codon[0] + # set second nucleotide of variant_codon + if sub_one == 1: + variant_codon = variant_codon + sub_one_nuc + elif sub_two == 1: + variant_codon = variant_codon + sub_two_nuc + else: + variant_codon = variant_codon + target_codon[1] + # set third nucleotide of variant_codon + if sub_one == -1 or sub_one == 2: + variant_codon = variant_codon + sub_one_nuc + elif sub_two == -1 or sub_two == 2: + variant_codon = variant_codon + sub_two_nuc + elif sub_three == -1 or sub_three == 2: + variant_codon = variant_codon + sub_three_nuc + else: + variant_codon = variant_codon + target_codon[2] + + # add values for target_codon, codon_number, and variant_codon to this row + return target_codon, codon_number, variant_codon + + +def is_wild_type(hgvs): + """ + This function takes an hgvs formatted string and returns True if the hgvs string indicates + there was no change from the target sequence. 
+ + Parameters + ---------- + hgvs : string + hgvs formatted string + + Returns + ------- + wt : bool + True if hgvs string indicates wild type + """ + wt = False + if hgvs.startswith("_wt"): + wt = True + return wt + + +def is_deletion(hgvs): + """ + This function takes an hgvs formatted string and returns True if the hgvs string indicates + there was a deletion. + + Parameters + ---------- + hgvs : string + hgvs formatted string + + Returns + ------- + deletion : bool + True if hgvs string is indicates a deletion + """ + deletion = False + if hgvs.endswith("del"): + deletion = True + return deletion + + +def is_substitution_one_base(hgvs): + """ + This function takes an hgvs formatted string and returns True if the hgvs string indicates + there was a substitution at one base of the codon. + + Parameters + ---------- + hgvs : string + hgvs formatted string + + Returns + ------- + sub_one : bool + True if hgvs string is indicates a substitution at one base of codon + """ + sub_one = False + if hgvs[-2] == ">": + sub_one = True + return sub_one + + +def is_substitution_two_bases_nonadjacent(hgvs): + """ + This function takes an hgvs formatted string and returns True if the hgvs string indicates + there were substitutions (non-adjacent) in the codon. 
+ + Parameters + ---------- + hgvs : string + hgvs formatted string + + Returns + ------- + sub_two : bool + True if hgvs string is indicates a substitution at one base of codon + """ + sub_two = False + if hgvs[-1] == "]": + sub_two = True + return sub_two From 546321c94de9de1568e421e645d8dadfbf6d936c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:42:39 -0700 Subject: [PATCH 708/877] add test cases for utilities functions --- tests/validation/utilities.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/validation/utilities.py diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py new file mode 100644 index 0000000..e69de29 From 48cc6cef6e3d52d7ae9b9644c7aba203d404fce3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:56:58 -0700 Subject: [PATCH 709/877] outline test cases for utilities functions --- tests/validation/utilities.py | 77 +++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index e69de29..fc13448 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -0,0 +1,77 @@ +from unittest import TestCase + +from mavecore.validation.utilities import ( + is_null, + generate_hgvs, + construct_hgvs_pro, + convert_hgvs_nt_to_hgvs_pro, + is_wild_type, + is_deletion, + is_substitution_one_base, + is_substitution_two_bases_nonadjacent +) + + +class TestIsNull(TestCase): + def valid_null_values(self): + pass + + def invalid_null_values(self): + pass + + +class TestGenerateHgvsPro(TestCase): + def test_pro(self): + pass + + def test_nt(self): + pass + + +class TestConstructHgvsPro(TestCase): + def valid_arguments(self): + pass + + def invalid_wt_aa(self): + pass + + def invalid_mut_aa(self): + pass + + def invalid_position(self): + pass + + +class TestConvertHgvsNtToHgvsPro(TestCase): + 
def invalid_hgvs_nt(self): + pass + + def wt_hgvs_nt(self): + pass + + def deletion_hgvs_nt(self): + pass + + def one_base_change_codon_variant(self): + pass + + def two_base_change_codon_variant(self): + pass + + def three_base_change_codon_variant(self): + pass + + +class TestVariantTypeHelperFunctions(TestCase): + + def test_is_wild_type(self): + pass + + def is_deletion(self): + pass + + def test_is_substitution_one_base(self): + pass + + def test_is_substitution_two_bases_nonadjacent(self): + pass From 00c1d40bc09e590d704d905d17a8f38842d81a68 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:57:10 -0700 Subject: [PATCH 710/877] edit imports --- mavecore/validation/utilities.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 390084f..d337407 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -3,8 +3,7 @@ from mavehgvs.variant import Variant from mavecore.validation.variant import validate_hgvs_string -from mavecore.validation.constants.conversion import aa_dict_key_3, codon_dict_DNA -#from mavetools.mavedf.mutation_type import * +from mavecore.validation.constants.conversion import codon_dict_DNA def is_null(value): From dbb14050a04fb7521e05e3b281acecdd3fc98af7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:57:42 -0700 Subject: [PATCH 711/877] edit function signature and write documentation for construct_hgvs_pro --- mavecore/validation/utilities.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index d337407..2e5eeb3 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -51,8 +51,26 @@ def generate_hgvs(prefix: str = "c") -> str: return f"{prefix}.{choice(range(1, 
100))}{ref}>{alt}" -def construct_hgvs_pro(wt, mutant, position: int): - #if pd.isna(position): return None +def construct_hgvs_pro(wt: str, mutant: str, position: int): + """ + Given the wt and mutant codons as well as the position, this function generates a validated + hgvs_pro string. + + Parameters + __________ + wt: str + The wt codon. + mutant: str + The mutant codon. + position: int + The position of the change. + + Returns + _______ + hgvs + The constructed hgvs_pro string. + """ + # TODO account for when variant codon is None, a deletion event if wt == mutant: hgvs = "p." + wt + str(position) + "=" else: From 8d1f356b5a639f95c46601ed7086feaefe61df41 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:57:55 -0700 Subject: [PATCH 712/877] remove comment --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 2e5eeb3..105ad38 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -77,7 +77,7 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int): hgvs = "p." 
+ wt + str(position) + mutant # validate variant validate_hgvs_string(hgvs) - return hgvs #, hgvs_validate + return hgvs def get_codon_data_from_nt_variants(hgvs_nt, target_seq): From 93ac86d48ca006e1e5eb24bb285561f1d4de719e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:58:38 -0700 Subject: [PATCH 713/877] edit function signature and update documentation for convert_hgvs_nt_to_hgv_pro --- mavecore/validation/utilities.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 105ad38..feff2dd 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -80,23 +80,25 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int): return hgvs -def get_codon_data_from_nt_variants(hgvs_nt, target_seq): +def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): + # TODO note that this only works for codon changes and single mutants """ - This method takes in a target sequence and converts coding variants into codon changes. - These changes are stored in three additional MaveDf columns: target_codon, codon_number, and variant_codon. - This method also updates the hgvs from legacy to mave hgvs, if it is not already done. + This function takes a hgvs_nt variant string and its associated target sequence and returns + a validated hgvs_pro equivalent. Parameters __________ - target_seq : string - target sequence + hgvs_nt: string + The hgvs_nt string that will be converted. + target_seq: + The target sequence associated with the hgvs_nt variant. Raises ______ TypeError - if target_seq is not string + If target_seq is not string. ValueError - if target_seq is not made solely of characters ACTG + If target_seq is not made solely of characters ACTG. 
""" # check for TypeError # if target_seq is not string From 972662ef829db43600eda81fedfab0cfd9dbe53e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:58:51 -0700 Subject: [PATCH 714/877] update error messages --- mavecore/validation/utilities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index feff2dd..24d9fce 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -103,13 +103,13 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): # check for TypeError # if target_seq is not string if not isinstance(target_seq, str): - raise TypeError("target_seq must be string") + raise TypeError("target_seq must be string.") # check for ValueError # if target_seq is not made solely of characters ACTG check_chars = [letter in "ACTG" for letter in target_seq] if False in check_chars: - raise ValueError("target_seq is invalid") + raise ValueError("target_seq is invalid, must be composed only of bases ACTG.") # identify variant_position and get codon_number associated with it From 9d4a283289d528bcb5e8804d95581ebd8feab17b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:59:18 -0700 Subject: [PATCH 715/877] declare variables ouside of if else statements --- mavecore/validation/utilities.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 24d9fce..de64aee 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -131,6 +131,18 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): # use codon_number to get target_codon from target_seq target_codon = target_seq[(codon_number - 1) * 3 : codon_number * 3] + # declare variables for codon data + # keep track of the number and location of the changes within the codon + 
sub_one = None + sub_two = None + sub_three = None + # keep tack of the number and value of the changes within the codon + sub_one_nuc = None + sub_two_nuc = None + sub_three_nuc = None + # keep track of the full codon changes + variant_codon = None + # determine sequence of variant_codon if is_wild_type(hgvs_nt): # variant_codon is wild-type From 385181a8a19d8124afcfa7c372656c605a2bece6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 11:59:48 -0700 Subject: [PATCH 716/877] refactor code and edit return value of conversion function --- mavecore/validation/utilities.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index de64aee..fbaa6c2 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -231,8 +231,14 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): else: variant_codon = variant_codon + target_codon[2] - # add values for target_codon, codon_number, and variant_codon to this row - return target_codon, codon_number, variant_codon + # convert to 3 letter amino acid code + target_aa = codon_dict_DNA[target_codon] + if variant_codon: + variant_aa = codon_dict_DNA[variant_codon] + else: + variant_aa = None + + return construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number) def is_wild_type(hgvs): From 639cdbf3137742b0e6bf57a6a941bf85b2428c9b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:00:12 -0700 Subject: [PATCH 717/877] delete comment --- tests/validation/dataframe.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index df1a286..03bda74 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -6,10 +6,6 @@ """ from io import BytesIO, StringIO -from unittest import TestCase - - -import pandas as pd 
from mavecore.validation import constants From cc045de3ffabaab155d8f400aa7a7385ac8f10dd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:01:02 -0700 Subject: [PATCH 718/877] delete redundant unittests --- tests/validation/dataframe.py | 124 ---------------------------------- 1 file changed, 124 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 03bda74..a53f8ca 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -252,100 +252,7 @@ def mock_return_value(data, index=None): df.index = pd.Index(df[index]) return df''' - def test_invalid_row_hgvs_is_not_a_string(self): - '''data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - def test_invalid_missing_hgvs_columns(self): - '''data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL, generate_hgvs()) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - def test_replaces_null_with_none_in_secondary_hgvs_column(self): - '''hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{},{}\n{},{},1.0 ".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.HGVS_PRO_COL]), [None] - )''' - - def test_replaces_null_with_none_in_numeric_columns(self): - '''hgvs_nt = generate_hgvs(prefix="c") - for c in constants.null_values_list: - with self.subTest(msg=f"'{c}'"): - data = "{},{}\n{},{}".format( - 
self.HGVS_NT_COL, self.SCORE_COL, hgvs_nt, c - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - list(dataset.data(serializable=True)[self.SCORE_COL]), [None] - )''' - - def test_invalid_null_values_in_header(self): - '''for value in constants.null_values_list: - with self.subTest(msg=f"'{value}'"): - data = "{},{},{}\n{},1.0,1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, value, generate_hgvs() - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - def test_invalid_no_additional_columns_outside_hgvs_ones(self): - '''data = "{},{},{}\n{},{},{}".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - def test_scores_missing_scores_column(self): - '''data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, "scores_rna", generate_hgvs(prefix="g"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' def test_invalid_missing_either_required_hgvs_column(self): '''data = "{},{}\n{},{}".format( @@ -359,40 +266,9 @@ def test_invalid_missing_either_required_hgvs_column(self): self.assertEqual(len(dataset.errors), 1) print(dataset.errors)''' - def test_empty_no_variants_parsed(self): - '''data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - self.assertTrue(dataset.is_empty) - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - 
print(dataset.errors)''' - def test_error_non_numeric_values_in_score_column(self): - '''data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - "I am not a number", - ) - - with self.assertRaises(ValueError): - MaveDataset.for_scores(StringIO(data))''' - - def test_invalid_same_hgvs_nt_defined_in_two_rows(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): '''hgvs = generate_hgvs(prefix="p") From c553ccee54aab1f36aa222f7e183860f90eab7c4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:01:23 -0700 Subject: [PATCH 719/877] add test cases for validating datasets --- tests/validation/dataframe.py | 74 +++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index a53f8ca..290599d 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -135,6 +135,80 @@ def test_invalid_variants(self): pass +class TestValidateValuesByColumn(TestCase): + def setUp(self): + self.target_seq = "ACA" + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G"], + hgvs_pro_column: ["p.Thr1Ala"], + hgvs_splice_column: ["c.1A>G"], + required_score_column: [1.000], + } + ) + + def test_non_numeric_values_in_score_column(self): + self.dataframe[required_score_column][0] = "not a float" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + '''data = "{},{}\n{},{}".format( + self.HGVS_NT_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + "I am not a number", + ) + + with 
self.assertRaises(ValueError): + MaveDataset.for_scores(StringIO(data))''' + #pass + + def test_invalid_row_hgvs_is_not_a_string(self): + '''data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + pass + + def test_empty_no_variants_parsed(self): + '''data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_empty) + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + pass + + def test_invalid_same_hgvs_nt_defined_in_two_rows(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},1.0\n{},1.0".format( + self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + pass + + +class TestValidateScore(TestCase): + def test_valid_score(self): + validate_score(1.1) + + def test_invalid_score(self): + with self.assertRaises(ValidationError): + validate_score("a") + + class TestVariantsMatchHgvsColumnNames(TestCase): def test_valid(self): pass From fb5e2b02b8672c523cf9f7baaf02c74a2352a488 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:01:33 -0700 Subject: [PATCH 720/877] delete comments --- tests/validation/dataframe.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 290599d..32d6651 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -259,16 +259,12 @@ def test_counts_defines_different_pro_variants(self): """ from io import StringIO -import unittest -from unittest import TestCase -from 
random import choice import pandas as pd from pandas.testing import assert_index_equal # from dataset import constants from mavecore.validation import constants -from mavecore.validation.exceptions import ValidationError from mavecore.validation.variant_validators import ( MaveDataset, From 0c145804ee0d8c5944c9f36b8efa7adc110875ed Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:01:45 -0700 Subject: [PATCH 721/877] move code to utilities --- tests/validation/dataframe.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 32d6651..fd0de67 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -271,31 +271,6 @@ def test_counts_defines_different_pro_variants(self): ) """ -def generate_hgvs(prefix: str = "c") -> str: - """'''Generates a random hgvs string from a small sample.'''""" - '''if prefix == "p": - # Subset of 3-letter codes, chosen at random. 
- amino_acids = [ - "Ala", - "Leu", - "Gly", - "Val", - "Tyr", - "Met", - "Cys", - "His", - "Glu", - "Phe", - ] - ref = choice(amino_acids) - alt = choice(amino_acids) - return f"{prefix}.{ref}{choice(range(1, 100))}{alt}" - else: - alt = choice("ATCG") - ref = choice("ATCG") - return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}"''' - - class TestMaveDataset(TestCase): """ From 93a69dbe9f1c19d03f00e64548f9bd131840220f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:02:12 -0700 Subject: [PATCH 722/877] edit imports --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 96bce17..7ea3a11 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -4,7 +4,7 @@ from mavecore.validation.constants.general import * from mavecore.validation.exceptions import ValidationError from mavecore.validation.variant import validate_hgvs_string -from mavecore.validation.utilities import construct_hgvs_pro, get_codon_data_from_nt_variants +from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro from mavecore.validation.constants.conversion import codon_dict_DNA From b8f5c58df83495fa9131824b3b751f47731e4073 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:02:30 -0700 Subject: [PATCH 723/877] delete TODO edit comments --- mavecore/validation/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 7ea3a11..88fa6c7 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -94,9 +94,8 @@ def validate_column_names(columns, scores=True): # there should be at least one additional column beyond the hgvs columns if len(columns) == count: raise ValidationError("There must be at least 
one additional column beyond the hgvs columns.") - # validate against UTF-8byte ordering marks - # TODO if dataframe is a scores df make sure it has a score column - # also make sure counts df has a counts column + # if dataframe is a scores df make sure it has a score column + # also make sure counts df has a counts column and not a score column if scores and not score_column: raise ValidationError("A scores dataframe must include a `score` column.") if not scores and score_column: From f70ee4371972674df85bfe8f3436c0e034c0c467 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:03:23 -0700 Subject: [PATCH 724/877] update validation function implementation --- mavecore/validation/dataframe.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 88fa6c7..b7563db 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -188,13 +188,7 @@ def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq, nt, pro, row ValidationError If any of the variants within each column do not represent the same change. 
""" - # first get data from nt variant - target_codon, codon_number, variant_codon = get_codon_data_from_nt_variants(hgvs_nt=nt, target_seq=target_seq) - # convert to 3 letter amino acid code - target_aa = codon_dict_DNA[target_codon] - variant_aa = codon_dict_DNA[variant_codon] - # then construct a protein variant from the data - nt_converted = construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number) + nt_converted = convert_hgvs_nt_to_hgvs_pro(nt, target_seq) # compare nt_converted with pro if nt_converted != pro: raise ValidationError("The hgvs_nt variant {} and the hgvs_pro variant {} on row {} do not represent the " From 69fe2078f782808ba25cb5e57a77a4ace152f15a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:41:19 -0700 Subject: [PATCH 725/877] add builtin python typing to function signatures --- mavecore/validation/dataframe.py | 6 +++--- mavecore/validation/identifier.py | 28 ++++++++++++++-------------- mavecore/validation/urn.py | 10 +++++----- mavecore/validation/validate.py | 2 +- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index b7563db..5605081 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -8,7 +8,7 @@ from mavecore.validation.constants.conversion import codon_dict_DNA -def validate_dataframes(target_seq, scores, counts=None): +def validate_dataframes(target_seq: str, scores, counts=None): """ Validates scores and counts dataframes for MaveDB upload. This function performs comprehensive validation. @@ -103,7 +103,7 @@ def validate_column_names(columns, scores=True): "column in a scores dataframe.") -def validate_values_by_column(dataset, target_seq): +def validate_values_by_column(dataset, target_seq: str): """ Validates a string of variants and verifies that the variant type in the column name makes sense with regards to the actual variants. 
@@ -171,7 +171,7 @@ def validate_score(score): ) -def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq, nt, pro, row): +def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str, pro: str, row: int): """ Checks that, when two or more of hgvs_nt, hgvs_pro, and hgvs_splice columns exist, the variant strings within those columns are representing the same change. diff --git a/mavecore/validation/identifier.py b/mavecore/validation/identifier.py index 49b2412..4f5bc1e 100644 --- a/mavecore/validation/identifier.py +++ b/mavecore/validation/identifier.py @@ -5,7 +5,7 @@ from mavecore.validation.constants.identifier import valid_dbnames -def validate_external_identifier(identifier): +def validate_external_identifier(identifier: dict): """ Validates an external identifier represented as a dictionary. The dictionary should have a length of 2 and have the keys `dbname` and `identifier`, both with str values. The valid values for these keys are @@ -47,7 +47,7 @@ def validate_external_identifier(identifier): # TODO add other conditions like the one above -def validate_sra_identifier(identifier): +def validate_sra_identifier(identifier: str): """ Validates whether the identifier is a valid SRA identifier. @@ -74,7 +74,7 @@ def validate_sra_identifier(identifier): ) -def validate_pubmed_identifier(identifier): +def validate_pubmed_identifier(identifier: str): """ Validates whether the identifier is a valid PubMed identifier. @@ -93,7 +93,7 @@ def validate_pubmed_identifier(identifier): raise ValidationError("{} is not a valid PubMed identifier.".format(identifier)) -def validate_doi_identifier(identifier): +def validate_doi_identifier(identifier: str): """ Validates whether the identifier is a valid DOI identifier. 
@@ -112,7 +112,7 @@ def validate_doi_identifier(identifier): raise ValidationError("{} is not a valid DOI identifier.".format(identifier)) -def validate_ensembl_identifier(identifier): +def validate_ensembl_identifier(identifier: str): """ Validates whether the identifier is a valid Ensembl identifier. @@ -130,7 +130,7 @@ def validate_ensembl_identifier(identifier): raise ValidationError(f"'{identifier}' is not a valid Ensembl accession.") -def validate_uniprot_identifier(identifier): +def validate_uniprot_identifier(identifier: str): """ Validates whether the identifier is a valid UniProt identifier. @@ -148,7 +148,7 @@ def validate_uniprot_identifier(identifier): raise ValidationError(f"'{identifier}' is not a valid UniProt accession.") -def validate_refseq_identifier(identifier): +def validate_refseq_identifier(identifier: str): """ Validates whether the identifier is a valid RefSeq identifier. @@ -166,7 +166,7 @@ def validate_refseq_identifier(identifier): raise ValidationError(f"'{identifier}' is not a valid RefSeq accession.") -def validate_genome_identifier(identifier): +def validate_genome_identifier(identifier: str): """ Validates whether the identifier is a valid genome identifier. @@ -186,7 +186,7 @@ def validate_genome_identifier(identifier): ) -def validate_pubmed_list(values): +def validate_pubmed_list(values: list[str]): """ Validates whether each identifier in a list of identifiers (values) is a valid PubMed identifier. @@ -205,7 +205,7 @@ def validate_pubmed_list(values): validate_pubmed_identifier(value) -def validate_sra_list(values): +def validate_sra_list(values: list[str]): """ Validates whether each identifier in a list of identifiers (values) is a valid SRA identifier. @@ -224,7 +224,7 @@ def validate_sra_list(values): validate_sra_identifier(value) -def validate_doi_list(values): +def validate_doi_list(values: list[str]): """ Validates whether each identifier in a list of identifiers (values) is a valid DOI identifier. 
@@ -243,7 +243,7 @@ def validate_doi_list(values): validate_doi_identifier(value) -def validate_ensembl_list(values): +def validate_ensembl_list(values: list[str]): """ Validates whether each identifier in a list of identifiers (values) is a valid Ensembl identifier. @@ -262,7 +262,7 @@ def validate_ensembl_list(values): validate_ensembl_identifier(value) -def validate_refseq_list(values): +def validate_refseq_list(values: list[str]): """ Validates whether each identifier in a list of identifiers (values) is a valid RefSeq identifier. @@ -281,7 +281,7 @@ def validate_refseq_list(values): validate_refseq_identifier(value) -def validate_uniprot_list(values): +def validate_uniprot_list(values: list[str]): """ Validates whether each identifer in a list of identifiers (values) is a valid UniProt identifier. diff --git a/mavecore/validation/urn.py b/mavecore/validation/urn.py index d8f3d96..2f5a52b 100644 --- a/mavecore/validation/urn.py +++ b/mavecore/validation/urn.py @@ -3,7 +3,7 @@ from mavecore.validation.constants.urn import * -def validate_mavedb_urn(urn): +def validate_mavedb_urn(urn: str): """ This function validates a MaveDB urn and raises an error if it is not valid. @@ -29,7 +29,7 @@ def validate_mavedb_urn(urn): raise ValidationError("{}'s is not a valid urn.".format(urn)) -def validate_mavedb_urn_experimentset(urn): +def validate_mavedb_urn_experimentset(urn: str): """ This function validates a Experiment Set urn and raises an error if it is not valid. @@ -57,7 +57,7 @@ def validate_mavedb_urn_experimentset(urn): ) -def validate_mavedb_urn_experiment(urn): +def validate_mavedb_urn_experiment(urn: str): """ This function validates an Experiment urn and raises an error if it is not valid. @@ -85,7 +85,7 @@ def validate_mavedb_urn_experiment(urn): ) -def validate_mavedb_urn_scoreset(urn): +def validate_mavedb_urn_scoreset(urn: str): """ This function validates a Scoreset urn and raises an error if it is not valid. 
@@ -111,7 +111,7 @@ def validate_mavedb_urn_scoreset(urn): raise ValidationError("{}'s is not a valid score set urn.".format(urn)) -def validate_mavedb_urn_variant(urn): +def validate_mavedb_urn_variant(urn: str): """ This function validates a MaveDB Variant urn and raises an error if it is not valid. diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index e699889..8828797 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -3,7 +3,7 @@ from mavecore.validation.dataframe import validate_dataframes -def validate(dataset, dataset_type, scores=None, counts=None): +def validate(dataset: dict, dataset_type: str, scores=None, counts=None): """ This function validates data to by uploaded to MaveDB. Descriptive errors will be raised if any of the validation fails. Scores and counts are optional as this function accepts both experiments and scoresets. From e2f3a688c0383c560d400680468f256ff9ec953e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:41:54 -0700 Subject: [PATCH 726/877] edit imports --- mavecore/validation/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 5605081..a542292 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -1,11 +1,10 @@ from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal -from mavehgvs import Variant + from mavecore.validation.constants.general import * from mavecore.validation.exceptions import ValidationError -from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.variant import validate_nt_variant, validate_pro_variant, validate_splice_variant from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro -from mavecore.validation.constants.conversion import codon_dict_DNA def 
validate_dataframes(target_seq: str, scores, counts=None): From c3e9e2ad2ef0e6d5cb01852bb17ed1e1ccbb9064 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:42:36 -0700 Subject: [PATCH 727/877] update validate_values_by_column documentation --- mavecore/validation/dataframe.py | 39 ++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index a542292..820f35f 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -104,21 +104,30 @@ def validate_column_names(columns, scores=True): def validate_values_by_column(dataset, target_seq: str): """ - Validates a string of variants and verifies that the variant type in the column name makes - sense with regards to the actual variants. - - Parameters - __________ - variants: list[str] - List of mavehgvs formatted strings. - column_name: str - The hgvs column name from which the variants parameter originates. - - Raises - ______ - ValidationError - If any variant in the list of variants does not adhere to the mavehgvs specifications. - """ + Validates that the values in each column labeled `hgvs_nt`, `hgvs_pro`, `hgvs_splice`, and `score` make sense + with regards to their column name. It also validates via a helper function that if both an `hgvs_nt` column and + an `hgvs_pro` column exist, they are consistent with one another. + + Parameters + __________ + dataset: pandas.DataFrame + A scores or counts dataframe. + target_seq: str + The hgvs column name from which the variants parameter originates. + + Raises + ______ + ValidationError + If the target sequence does not contain solely the bases ACTG. + ValidationError + If any variant fails validation or if the variants are not consistent with one another. 
+ """ + # check for ValueError + # if target_seq is not made solely of characters ACTG + check_chars = [letter in "ACTG" for letter in target_seq] + if False in check_chars: + raise ValidationError("target_seq is invalid, must be composed only of bases ACTG.") + # first check the column names, establish the order or the hgvs and score columns hgvs_nt = False hgvs_pro = False From 9db8b324788f15ccaccc409a297ce959cbcb4fd7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:43:01 -0700 Subject: [PATCH 728/877] edit function calls for hgvs string validation --- mavecore/validation/dataframe.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 820f35f..4664142 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -148,17 +148,14 @@ def validate_values_by_column(dataset, target_seq: str): # loop through row by row, validate hgvs strings, make sure nt and pro are consistent with one another for i in range(len(dataset)): if hgvs_nt: - validate_hgvs_string(value=dataset.loc[i, hgvs_nt_column], - column="nt", - targetseq=target_seq) + validate_nt_variant(value=dataset.loc[i, hgvs_nt_column], + targetseq=target_seq) if hgvs_pro: - validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], - column="p", + validate_pro_variant(value=dataset.loc[i, hgvs_pro_column], targetseq=target_seq) if hgvs_splice: - validate_hgvs_string(value=dataset.loc[i, hgvs_splice_column], - column="splice", - targetseq=target_seq) + validate_splice_variant(value=dataset.loc[i, hgvs_splice_column], + targetseq=target_seq) if score: validate_score(dataset.loc[i, required_score_column]) if hgvs_nt and hgvs_pro: From 2aa3e90d1acf925d12e48d8dd4ba825a97adde59 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:43:16 -0700 Subject: [PATCH 729/877] mark TODO 
--- mavecore/validation/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 4664142..a2950dd 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -168,7 +168,8 @@ def validate_values_by_column(dataset, target_seq: str): # no protein target with just nt variants -def validate_score(score): +def validate_score(score: float): + # TODO we probably dont need this if type(score) != float: raise ValidationError( "Each value in score column must by a float. " From 4be7919d8a692215790b87f38a52c301581680fe Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:43:40 -0700 Subject: [PATCH 730/877] update documentation for validating variants represent the same change --- mavecore/validation/dataframe.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index a2950dd..855c0d5 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -179,20 +179,22 @@ def validate_score(score: float): def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str, pro: str, row: int): """ - Checks that, when two or more of hgvs_nt, hgvs_pro, and hgvs_splice columns exist, the variant strings within + Checks that, when both an `hgvs_nt` and an `hgvs_pro` exist, the variant strings within those columns are representing the same change. Parameters __________ - nt: list - The hgvs_nt column represented as a list. + target_seq: str + The target sequence associated withe variants. + nt: str + The hgvs_nt string. pro: list - The hgvs_pro column represented as a list. + The hgvs_pro string. Raises ______ ValidationError - If any of the variants within each column do not represent the same change. + If the variants do not represent the same change. 
""" nt_converted = convert_hgvs_nt_to_hgvs_pro(nt, target_seq) # compare nt_converted with pro From 478966ec4b531ddabbf1205b064b4b05b04c38f4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:44:00 -0700 Subject: [PATCH 731/877] add values to valid_dbnames --- mavecore/validation/constants/identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/constants/identifier.py b/mavecore/validation/constants/identifier.py index b55c8ca..cc46004 100644 --- a/mavecore/validation/constants/identifier.py +++ b/mavecore/validation/constants/identifier.py @@ -1 +1 @@ -valid_dbnames = ["uniprot"] \ No newline at end of file +valid_dbnames = ["UniProt", "RefSeq", "Ensembl"] \ No newline at end of file From c3bbd1366ad2dd54e4a81e658c93b5d8d464e445 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:44:25 -0700 Subject: [PATCH 732/877] update parameter types in validation documentation --- mavecore/validation/identifier.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mavecore/validation/identifier.py b/mavecore/validation/identifier.py index 4f5bc1e..6809141 100644 --- a/mavecore/validation/identifier.py +++ b/mavecore/validation/identifier.py @@ -192,7 +192,7 @@ def validate_pubmed_list(values: list[str]): Parameters __________ - identifier: List[str] + identifier: list[str] The list of identifiers to be validated. Raises @@ -211,7 +211,7 @@ def validate_sra_list(values: list[str]): Parameters __________ - identifier: List[str] + identifier: list[str] The list of identifiers to be validated. Raises @@ -230,7 +230,7 @@ def validate_doi_list(values: list[str]): Parameters __________ - identifier: List[str] + identifier: list[str] The list of identifiers to be validated. 
Raises @@ -249,7 +249,7 @@ def validate_ensembl_list(values: list[str]): Parameters __________ - identifier: List[str] + identifier: list[str] The list of identifiers to be validated. Raises @@ -268,7 +268,7 @@ def validate_refseq_list(values: list[str]): Parameters __________ - identifier: List[str] + identifier: list[str] The list of identifiers to be validated. Raises @@ -287,7 +287,7 @@ def validate_uniprot_list(values: list[str]): Parameters __________ - identifier: List[str] + identifier: list[str] The list of identifiers to be validated. Raises From 9be09d081ad98275962bfdf99891bec6c51fe066 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:44:39 -0700 Subject: [PATCH 733/877] add typing to function signature --- mavecore/validation/keywords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/keywords.py b/mavecore/validation/keywords.py index 9d83abc..04da4c5 100644 --- a/mavecore/validation/keywords.py +++ b/mavecore/validation/keywords.py @@ -10,7 +10,7 @@ def validate_keywords(v): validate_keyword(keyword) -def validate_keyword(keyword): +def validate_keyword(keyword: str): """ This function validates whether or not the kw parameter is valid by checking that it is a string that is not null. 
If kw is null From 7870b76096784e6eef365c5cf0450d6791e18874 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:45:23 -0700 Subject: [PATCH 734/877] update function signature and docstring, and refactor implementation for validating keywords --- mavecore/validation/keywords.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/keywords.py b/mavecore/validation/keywords.py index 04da4c5..2d8cd8f 100644 --- a/mavecore/validation/keywords.py +++ b/mavecore/validation/keywords.py @@ -2,11 +2,26 @@ from mavecore.validation.utilities import is_null -def validate_keywords(v): - if is_null(v): - raise ValidationError("{} are not valid keywords. Keywords must be a non null list of strings.".format(v)) +def validate_keywords(keywords: list[str]): + """ + Validates a list of keywords. + + Parameters + __________ + keywords: list[str] + A list of keywords. + + Raises + ______ + ValidationError + If the list is invalid or null or if any individual keyword is invalid or null. + """ + if is_null(keywords): + raise ValidationError( + "{} are not valid keywords. 
Keywords must be a non null list of strings.".format(keywords) + ) else: - for keyword in v: + for keyword in keywords: validate_keyword(keyword) From 58670ac51ddb8a56a334b2c5ae8c474348fe29a5 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:45:50 -0700 Subject: [PATCH 735/877] write documentation and include python typing for category validation functions --- mavecore/validation/target.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/target.py b/mavecore/validation/target.py index 21039c9..3991f31 100644 --- a/mavecore/validation/target.py +++ b/mavecore/validation/target.py @@ -2,13 +2,39 @@ from mavecore.validation.constants.target import valid_categories, valid_sequence_types -def validate_target_category(category): +def validate_target_category(category: str): + """ + If the target category provided does not fall within a pre-defined list of valid categories. + + Parameters + __________ + category: str + The target category to be validated. + + Raises + ______ + ValidationError + If the target category provided is not valid. + """ if category not in valid_categories: raise ValidationError("{}'s is not a valid target category. Valid categories are " "Protein coding, Regulatory, and Other noncoding".format(category)) -def validate_sequence_category(sequence_type): +def validate_sequence_category(sequence_type: str): + """ + If the sequence type provided does not fall within a pre-defined list of valid sequence types. + + Parameters + __________ + sequence_type: str + The sequence type to be validated. + + Raises + ______ + ValidationError + If the sequence type provided is not valid. + """ if sequence_type not in valid_sequence_types: raise ValidationError("{}'s is not a valid sequence type. 
Valid sequence types are " "Infer, DNA, and Protein".format(sequence_type)) From 868245c1d20d6fc52d981c594c11ef1370d5e4ed Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:46:15 -0700 Subject: [PATCH 736/877] update function signatures for helper functions --- mavecore/validation/utilities.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index fbaa6c2..062e1db 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -241,7 +241,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): return construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number) -def is_wild_type(hgvs): +def _is_wild_type(hgvs): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was no change from the target sequence. @@ -262,7 +262,7 @@ def is_wild_type(hgvs): return wt -def is_deletion(hgvs): +def _is_deletion(hgvs): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was a deletion. @@ -283,7 +283,7 @@ def is_deletion(hgvs): return deletion -def is_substitution_one_base(hgvs): +def _is_substitution_one_base(hgvs): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was a substitution at one base of the codon. @@ -304,7 +304,7 @@ def is_substitution_one_base(hgvs): return sub_one -def is_substitution_two_bases_nonadjacent(hgvs): +def _is_substitution_two_bases_nonadjacent(hgvs): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there were substitutions (non-adjacent) in the codon. 
From e1785eb534a86c6463fd469a8e2dcfdcbb2ed087 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:46:47 -0700 Subject: [PATCH 737/877] refactor code to reflect changes in function signatures --- mavecore/validation/utilities.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 062e1db..40ea41f 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -113,7 +113,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): # identify variant_position and get codon_number associated with it - if is_wild_type(hgvs_nt): # variant_codon is wild-type + if _is_wild_type(hgvs_nt): # variant_codon is wild-type codon_number = None target_codon = None else: # any other variant change @@ -145,13 +145,13 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): # determine sequence of variant_codon - if is_wild_type(hgvs_nt): # variant_codon is wild-type + if _is_wild_type(hgvs_nt): # variant_codon is wild-type variant_codon = target_codon sub_one = None # no nucleotide substitutions - elif is_deletion(hgvs_nt): # target_codon was deleted + elif _is_deletion(hgvs_nt): # target_codon was deleted variant_codon = None sub_one = None # no nucleotide substitutions - elif is_substitution_one_base( + elif _is_substitution_one_base( hgvs_nt ): # variant_codon has one nucleotide substitution # instantiate Variant object @@ -163,7 +163,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): # set other possible indices for codon substitution to None sub_two = None sub_three = None - elif is_substitution_two_bases_nonadjacent( + elif _is_substitution_two_bases_nonadjacent( hgvs_nt ): # variant has two nucleotide substitutions, non-adjacent # instantiate Variant object From f646c31d0442202b4be6bc3aea21410ac2f5df5b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: 
Tue, 27 Sep 2022 12:47:17 -0700 Subject: [PATCH 738/877] add values to __all__ --- mavecore/validation/variant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/variant.py b/mavecore/validation/variant.py index adf785d..2d785d0 100644 --- a/mavecore/validation/variant.py +++ b/mavecore/validation/variant.py @@ -10,7 +10,7 @@ hgvs_pro_column, ) -__all__ = ["validate_hgvs_string"] +__all__ = ["validate_hgvs_string", "validate_nt_variant", "validate_pro_variant", "validate_splice_variant"] from mavecore.validation.utilities import is_null From 5f95a13b1832804e136430470fc4af097981291a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:47:45 -0700 Subject: [PATCH 739/877] reformat file --- mavecore/validation/constants/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/constants/target.py b/mavecore/validation/constants/target.py index 0e83872..6183e2a 100644 --- a/mavecore/validation/constants/target.py +++ b/mavecore/validation/constants/target.py @@ -1,2 +1,2 @@ valid_categories = ["Protein coding", "Regulatory", "Other noncoding"] -valid_sequence_types = ["Infer", "DNA", "Protein"] \ No newline at end of file +valid_sequence_types = ["Infer", "DNA", "Protein"] From 3c7069fe71b26ceea1af12211482f299ed346421 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 10:54:27 -0700 Subject: [PATCH 740/877] reformat file --- mavecore/validation/constants/identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/constants/identifier.py b/mavecore/validation/constants/identifier.py index cc46004..100dedc 100644 --- a/mavecore/validation/constants/identifier.py +++ b/mavecore/validation/constants/identifier.py @@ -1 +1 @@ -valid_dbnames = ["UniProt", "RefSeq", "Ensembl"] \ No newline at end of file +valid_dbnames = ["UniProt", "RefSeq", "Ensembl"] 
From 46d2446c8a6c3a9717e92fbb193e569f346eb4ca Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 10:55:46 -0700 Subject: [PATCH 741/877] reformat docstrings --- mavecore/validation/dataframe.py | 20 ++++++++++---------- mavecore/validation/identifier.py | 28 ++++++++++++++-------------- mavecore/validation/validate.py | 8 ++++---- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 855c0d5..8771f46 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -14,9 +14,9 @@ def validate_dataframes(target_seq: str, scores, counts=None): Parameters __________ - scores: pandas.DataFrame + scores : pandas.DataFrame The scores data as a pandas dataframe. - counts: pandas.DataFrame + counts : pandas.DataFrame The counts data as a pandas dataframe. Raises @@ -41,7 +41,7 @@ def validate_no_null_columns_or_rows(dataframe): Parameters __________ - dataframe: pandas.DataFrame + dataframe : pandas.DataFrame The scores or counts dataframe being validated Raises @@ -68,7 +68,7 @@ def validate_column_names(columns, scores=True): Parameters __________ - dataframe: pandas.DataFrame + dataframe : pandas.DataFrame The scores or counts dataframe to be validated. Raises @@ -110,7 +110,7 @@ def validate_values_by_column(dataset, target_seq: str): Parameters __________ - dataset: pandas.DataFrame + dataset : pandas.DataFrame A scores or counts dataframe. target_seq: str The hgvs column name from which the variants parameter originates. @@ -184,11 +184,11 @@ def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str Parameters __________ - target_seq: str + target_seq : str The target sequence associated withe variants. - nt: str + nt : str The hgvs_nt string. - pro: list + pro : list The hgvs_pro string. 
Raises @@ -210,9 +210,9 @@ def validate_dataframes_define_same_variants(scores, counts): Parameters ---------- - scores: pandas.DataFrame + scores : pandas.DataFrame Scores dataframe parsed from an uploaded scores file. - counts: pandas.DataFrame + counts : pandas.DataFrame Scores dataframe parsed from an uploaded counts file. Raises diff --git a/mavecore/validation/identifier.py b/mavecore/validation/identifier.py index 6809141..fffa2d3 100644 --- a/mavecore/validation/identifier.py +++ b/mavecore/validation/identifier.py @@ -13,7 +13,7 @@ def validate_external_identifier(identifier: dict): Parameters __________ - identifier: dict + identifier : dict The identifier to be validated. Raises @@ -53,7 +53,7 @@ def validate_sra_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. Raises @@ -80,7 +80,7 @@ def validate_pubmed_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. Raises @@ -99,7 +99,7 @@ def validate_doi_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. Raises @@ -118,7 +118,7 @@ def validate_ensembl_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. Raises @@ -136,7 +136,7 @@ def validate_uniprot_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. Raises @@ -154,7 +154,7 @@ def validate_refseq_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. Raises @@ -172,7 +172,7 @@ def validate_genome_identifier(identifier: str): Parameters __________ - identifier: str + identifier : str The identifier to be validated. 
Raises @@ -192,7 +192,7 @@ def validate_pubmed_list(values: list[str]): Parameters __________ - identifier: list[str] + identifier : list[str] The list of identifiers to be validated. Raises @@ -211,7 +211,7 @@ def validate_sra_list(values: list[str]): Parameters __________ - identifier: list[str] + identifier : list[str] The list of identifiers to be validated. Raises @@ -230,7 +230,7 @@ def validate_doi_list(values: list[str]): Parameters __________ - identifier: list[str] + identifier : list[str] The list of identifiers to be validated. Raises @@ -249,7 +249,7 @@ def validate_ensembl_list(values: list[str]): Parameters __________ - identifier: list[str] + identifier : list[str] The list of identifiers to be validated. Raises @@ -268,7 +268,7 @@ def validate_refseq_list(values: list[str]): Parameters __________ - identifier: list[str] + identifier : list[str] The list of identifiers to be validated. Raises @@ -287,7 +287,7 @@ def validate_uniprot_list(values: list[str]): Parameters __________ - identifier: list[str] + identifier : list[str] The list of identifiers to be validated. Raises diff --git a/mavecore/validation/validate.py b/mavecore/validation/validate.py index 8828797..9f8533c 100644 --- a/mavecore/validation/validate.py +++ b/mavecore/validation/validate.py @@ -10,13 +10,13 @@ def validate(dataset: dict, dataset_type: str, scores=None, counts=None): Parameters __________ - dataset: dict + dataset : dict The scoreset or experiment to be uploaded. This will be cast into a pydantic object. - dataset_type: str + dataset_type : str The type of dataset that the first argument is, either "experiments" or "scoresets". - scores: Pandas.DataFrame + scores : Pandas.DataFrame The scores dataframe as a Pandas DataFrame. - counts: Pandas.DataFrame + counts : Pandas.DataFrame The counts dataframe as a Pandas DataFrame. 
Raises From 87c3fa9c383045a8486f84d7ed890bf25803bd7e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 10:56:41 -0700 Subject: [PATCH 742/877] edit imports --- mavecore/models/map.py | 4 +--- mavecore/models/sequence.py | 2 +- mavecore/models/target.py | 2 +- mavecore/validation/dataframe.py | 11 +++++++++-- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index c68e98a..75ddb20 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -1,6 +1,4 @@ -from pydantic import BaseModel, ValidationError, validator -from datetime import datetime -from typing import Optional +from pydantic import BaseModel class ReferenceMap(BaseModel): diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index 3b2ae2f..293552b 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, ValidationError, validator +from pydantic import BaseModel, validator from mavecore.validation import target diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 409eb79..77e1a18 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, ValidationError, validator +from pydantic import BaseModel, validator from typing import List, Optional from .map import ReferenceMap diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 8771f46..9a6fb17 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -1,7 +1,14 @@ from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal - -from mavecore.validation.constants.general import * +from mavehgvs.variant import Variant + +from mavecore.validation.constants.general import ( + readable_null_values_list, + hgvs_nt_column, + hgvs_pro_column, + hgvs_splice_column, + required_score_column +) from 
mavecore.validation.exceptions import ValidationError from mavecore.validation.variant import validate_nt_variant, validate_pro_variant, validate_splice_variant from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro From eed29f695f33a8ff3788020e1c53bf93353ea024 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:14:19 -0700 Subject: [PATCH 743/877] reformat --- mavecore/validation/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 9a6fb17..2400ae2 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -92,7 +92,8 @@ def validate_column_names(columns, scores=True): if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: count+=1 if columns[i] == required_score_column: score_column = True # there should be at least one hgvs column - if count == 0: raise ValidationError("Must include hgvs_nt, hgvs_pro, or hgvs_splice column.") + if count == 0: + raise ValidationError("Must include hgvs_nt, hgvs_pro, or hgvs_splice column.") # first columns should be hgvs columns for i in range(count): if columns[i] not in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: From 392a0eb1aeee0a9986b735895bde477f4465e421 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:14:59 -0700 Subject: [PATCH 744/877] add condition to only perform validation when there is a protein single variant --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 2400ae2..356479f 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -166,7 +166,7 @@ def validate_values_by_column(dataset, target_seq: str): targetseq=target_seq) if score: validate_score(dataset.loc[i, 
required_score_column]) - if hgvs_nt and hgvs_pro: + if hgvs_nt and not Variant(hgvs_pro).is_multi_variant(): # can only convert to single hgvs_pro variants validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, nt=dataset.loc[i, hgvs_nt_column], pro=dataset.loc[i, hgvs_pro_column], From 40a5764d2acfdc6f7f1616cbb63f654437e41922 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:15:07 -0700 Subject: [PATCH 745/877] reformat --- mavecore/models/identifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index eef156f..d3672dd 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -3,6 +3,7 @@ from mavecore.validation import identifier as id + class Identifier(BaseModel): identifier: str From 37b0e4695489a9fa96ae97943e8a0844102854f8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:15:28 -0700 Subject: [PATCH 746/877] delete TODO and reformat --- mavecore/models/identifier.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index d3672dd..4b91c86 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -52,9 +52,6 @@ class ExternalIdentifier(BaseModel): identifier: dict offset: Optional[int] - # TODO validate the offset in relation to the ExternalIdentifier @validator('identifier') def validate_identifier(cls, v): id.validate_external_identifier(v) - - From b8c19c03e6900b80c9c91868d12abd3286b368c3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:16:22 -0700 Subject: [PATCH 747/877] correct placement of if else statements --- mavecore/validation/identifier.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mavecore/validation/identifier.py 
b/mavecore/validation/identifier.py index fffa2d3..9e6c693 100644 --- a/mavecore/validation/identifier.py +++ b/mavecore/validation/identifier.py @@ -27,24 +27,31 @@ def validate_external_identifier(identifier: dict): ValidationError If the `identifier` value is not correct as it relates to the `dbname` value. """ + # check that identifier dict only has two keys if len(identifier) != 2: raise ValidationError("The identifier attribute of the external identifier should have two keys, `dbname` " "and `identifier`.") + # check that the keys are the right name - elif "dbname" not in identifier: + if "dbname" not in identifier: raise ValidationError("The identifier attribute of the external identifier should have two Keys, `dbname` " "and `identifier`.") - elif "identifier" not in identifier: + if "identifier" not in identifier: raise ValidationError("The identifier attribute of the external identifier should have two Keys, `dbname` " "and `identifier`.") + # check that dbname is valid - elif identifier.get("dbname") not in valid_dbnames: + if identifier.get("dbname") not in valid_dbnames: raise ValidationError(f"The `dbname` key within the identifier attribute of the external identifier should " f"take one of the following values: {valid_dbnames}.") - # validate identifier based on dbname - elif identifier.get("dbname") == "uniprot": + + # validate identifier based on dbname: could be one of UniProt, RefSeq, or Ensembl + if identifier.get("dbname") == "UniProt": validate_uniprot_identifier(identifier.get("identifier")) - # TODO add other conditions like the one above + elif identifier.get("dbname") == "RefSeq": + validate_refseq_identifier(identifier.get("identifier")) + elif identifier.get("dbname") == "Ensembl": + validate_ensembl_identifier(identifier.get("identifier")) def validate_sra_identifier(identifier: str): From 53f8cb26069c0d0c4815c960c545ae4b4df64fe8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 
11:16:31 -0700 Subject: [PATCH 748/877] delete comment --- mavecore/models/map.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mavecore/models/map.py b/mavecore/models/map.py index 75ddb20..4b50a32 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -4,10 +4,3 @@ class ReferenceMap(BaseModel): genomeId: int targetId: int - - '''@validator('creationDate', 'modificationDate') - def date_must_match_regex(cls, v): - # regular expression for validating a date - regex = '%Y-%m-%d' - if not bool(datetime.strptime(v, regex)): - raise ValidationError("{}'s is not a valid date.".format(v))''' From 6f442bd9750c220840a13bc678f7dbb55782a869 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:16:40 -0700 Subject: [PATCH 749/877] edit import --- mavecore/validation/utilities.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 40ea41f..a92534c 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -1,5 +1,6 @@ from mavecore.validation.constants.general import null_values_re from random import choice +from typing import Optional from mavehgvs.variant import Variant from mavecore.validation.variant import validate_hgvs_string From 9145f0b1971915db1794a0a29bee2096c86a097b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:17:07 -0700 Subject: [PATCH 750/877] edit docstring to take target_seq as argument --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index a92534c..c44f7a2 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -52,7 +52,7 @@ def generate_hgvs(prefix: str = "c") -> str: return f"{prefix}.{choice(range(1, 100))}{ref}>{alt}" -def construct_hgvs_pro(wt: str, mutant: str, 
position: int): +def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional[str] = None): """ Given the wt and mutant codons as well as the position, this function generates a validated hgvs_pro string. From 90af7c2c75ea13282e328278b1821166053d0f4e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:17:27 -0700 Subject: [PATCH 751/877] pass optional attributes for more precise validation --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index c44f7a2..234ae74 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -77,7 +77,7 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional else: hgvs = "p." + wt + str(position) + mutant # validate variant - validate_hgvs_string(hgvs) + validate_hgvs_string(value=hgvs, column="p", targetseq=target_seq) return hgvs From 002dfc4de91fd9ffa1fbe92f95b94f5b859b352c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:17:45 -0700 Subject: [PATCH 752/877] add type hints to function signature --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 234ae74..b21149c 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -81,7 +81,7 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional return hgvs -def convert_hgvs_nt_to_hgvs_pro(hgvs_nt, target_seq): +def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str): # TODO note that this only works for codon changes and single mutants """ This function takes a hgvs_nt variant string and its associated target sequence and returns From 7ac01a3ab1d0adee592b5601116a9b04828d92b9 Mon Sep 17 00:00:00 
2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:18:10 -0700 Subject: [PATCH 753/877] edit return value to reflect changes in function signature --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index b21149c..790dfe0 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -239,7 +239,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str): else: variant_aa = None - return construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number) + return construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number, target_seq=target_seq) def _is_wild_type(hgvs): From ddd55e04fad8ab1558f68ebca78458a428f18984 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Thu, 29 Sep 2022 11:18:34 -0700 Subject: [PATCH 754/877] add type hints to function signatures --- mavecore/validation/utilities.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 790dfe0..b1f20e2 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -242,7 +242,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str): return construct_hgvs_pro(wt=target_aa, mutant=variant_aa, position=codon_number, target_seq=target_seq) -def _is_wild_type(hgvs): +def _is_wild_type(hgvs: str): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was no change from the target sequence. @@ -263,7 +263,7 @@ def _is_wild_type(hgvs): return wt -def _is_deletion(hgvs): +def _is_deletion(hgvs: str): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was a deletion. 
@@ -284,7 +284,7 @@ def _is_deletion(hgvs): return deletion -def _is_substitution_one_base(hgvs): +def _is_substitution_one_base(hgvs: str): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was a substitution at one base of the codon. @@ -305,7 +305,7 @@ def _is_substitution_one_base(hgvs): return sub_one -def _is_substitution_two_bases_nonadjacent(hgvs): +def _is_substitution_two_bases_nonadjacent(hgvs: str): """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there were substitutions (non-adjacent) in the codon. From 19ee4e69f0442c646280aee3fa05ab932c324973 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:31:46 -0700 Subject: [PATCH 755/877] edit imports --- mavecore/validation/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 356479f..3f4d7eb 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -10,8 +10,8 @@ required_score_column ) from mavecore.validation.exceptions import ValidationError -from mavecore.validation.variant import validate_nt_variant, validate_pro_variant, validate_splice_variant -from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro +from mavecore.validation.variant import validate_hgvs_string +from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro, is_null def validate_dataframes(target_seq: str, scores, counts=None): From 9851ec0daa28cb042eea606548277b034064ed9f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:32:07 -0700 Subject: [PATCH 756/877] edit function call to reflect changes in function signature --- mavecore/validation/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataframe.py 
b/mavecore/validation/dataframe.py index 3f4d7eb..d1161f5 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -32,11 +32,11 @@ def validate_dataframes(target_seq: str, scores, counts=None): If any of the validation fails. """ validate_no_null_columns_or_rows(scores) - validate_column_names(scores.columns) + scores = validate_column_names(scores) validate_values_by_column(scores, target_seq) if counts is not None: validate_no_null_columns_or_rows(counts) - validate_column_names(counts.columns, scores=False) + counts = validate_column_names(counts, scores=False) validate_values_by_column(counts, target_seq) validate_dataframes_define_same_variants(scores, counts) From ace1948765cdf8ef398ff5f633f8594eaa620668 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:32:23 -0700 Subject: [PATCH 757/877] update parameter name --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index d1161f5..688254d 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -64,7 +64,7 @@ def validate_no_null_columns_or_rows(dataframe): raise ValidationError("Dataset should not contain null columns or rows.") -def validate_column_names(columns, scores=True): +def validate_column_names(dataframe, scores=True): """ This function validates the columns in a dataframe. The first columns should be an hgvs column such as hgvs_nt, hgvs_pro, and hgvs_splice. 
There should be at least From 6357c4c59b5d1c0180be846a13217dd9f939e129 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:33:15 -0700 Subject: [PATCH 758/877] update validate_column_names implementation --- mavecore/validation/dataframe.py | 62 ++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 688254d..6e3df40 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -83,24 +83,64 @@ def validate_column_names(dataframe, scores=True): ValidationError If the column names are not formatted correctly. """ + # get columns from dataframe + columns = dataframe.columns + # TODO do one of either hgvs_pro and hgvs_nt have to be present? # count instances of hgvs columns count = 0 + # note presence of different columns + hgvs_nt = False + hgvs_pro = False + hgvs_splice = False score_column = False for i in range(len(columns)): # there should not be any null columns - if columns[i] in readable_null_values_list: raise ValidationError("Column names must not be null.") - if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: count+=1 - if columns[i] == required_score_column: score_column = True - # there should be at least one hgvs column - if count == 0: - raise ValidationError("Must include hgvs_nt, hgvs_pro, or hgvs_splice column.") - # first columns should be hgvs columns - for i in range(count): - if columns[i] not in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: - raise ValidationError("First columns must be hgvs columns.") + if is_null(columns[i]) or columns[i] is None: + raise ValidationError("Column names must not be null.") # in readable_null_values_list: + if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: + count += 1 + # mark what type of column the current column is + if columns[i] == hgvs_nt_column: + hgvs_nt = True 
+ elif columns[i] == hgvs_pro_column: + hgvs_pro = True + elif columns[i] == hgvs_splice_column: + hgvs_splice = True + elif columns[i] == required_score_column: + score_column = True + # check for uppercase and raise error + elif (columns[i] == hgvs_nt_column.upper() or + columns[i] == hgvs_pro_column.upper() or + columns[i] == hgvs_splice_column.upper() or + columns[i] == required_score_column.upper()): + raise ValidationError("hgvs columns and score column should be lowercase.") + + # there should be at least one of hgvs_nt or hgvs_pro column + # if count == 0: + if not hgvs_nt and not hgvs_pro: + raise ValidationError("Must include hgvs_nt or hgvs_pro column.") # or hgvs_splice column.") + + # first columns should be hgvs columns, reorder columns to meet this requirement + if score_column: + score = dataframe.pop(required_score_column) + dataframe.insert(0, required_score_column, score) + if hgvs_splice: + splice_column = dataframe.pop(hgvs_splice_column) + dataframe.insert(0, hgvs_splice_column, splice_column) + if hgvs_pro: + pro_column = dataframe.pop(hgvs_pro_column) + dataframe.insert(0, hgvs_pro_column, pro_column) + if hgvs_nt: + nt_column = dataframe.pop(hgvs_nt_column) + dataframe.insert(0, hgvs_nt_column, nt_column) + #for i in range(count): + # if columns[i] not in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: + # raise ValidationError("First columns must be hgvs columns.") + # there should be at least one additional column beyond the hgvs columns if len(columns) == count: raise ValidationError("There must be at least one additional column beyond the hgvs columns.") + # if dataframe is a scores df make sure it has a score column # also make sure counts df has a counts column and not a score column if scores and not score_column: @@ -109,6 +149,8 @@ def validate_column_names(dataframe, scores=True): raise ValidationError("A counts dataframe should not include a `score` column, include `score` " "column in a scores dataframe.") + return 
dataframe + def validate_values_by_column(dataset, target_seq: str): """ From 72f1bef64a38ce6f7305bb5c85d9943dc2bc1618 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:33:31 -0700 Subject: [PATCH 759/877] check for empty dataset --- mavecore/validation/dataframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 6e3df40..afb2ef0 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -172,6 +172,10 @@ def validate_values_by_column(dataset, target_seq: str): ValidationError If any variant fails validation or if the variants are not consistent with one another. """ + # first check that dataframe is not empty + if dataset.empty: + raise ValidationError("Dataset must not be empty.") + # check for ValueError # if target_seq is not made solely of characters ACTG check_chars = [letter in "ACTG" for letter in target_seq] From 34845cf097890d126cdbec253811712baac56f3d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:35:42 -0700 Subject: [PATCH 760/877] update function calls --- mavecore/validation/dataframe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index afb2ef0..260e126 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -202,14 +202,17 @@ def validate_values_by_column(dataset, target_seq: str): # loop through row by row, validate hgvs strings, make sure nt and pro are consistent with one another for i in range(len(dataset)): if hgvs_nt: - validate_nt_variant(value=dataset.loc[i, hgvs_nt_column], - targetseq=target_seq) + validate_hgvs_string(value=dataset.loc[i, hgvs_nt_column], + column="nt", + targetseq=target_seq) if hgvs_pro: - validate_pro_variant(value=dataset.loc[i, hgvs_pro_column], + 
validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], + column="p", targetseq=target_seq) if hgvs_splice: - validate_splice_variant(value=dataset.loc[i, hgvs_splice_column], - targetseq=target_seq) + validate_hgvs_string(value=dataset.loc[i, hgvs_splice_column], + column="splice", + targetseq=target_seq) if score: validate_score(dataset.loc[i, required_score_column]) if hgvs_nt and not Variant(hgvs_pro).is_multi_variant(): # can only convert to single hgvs_pro variants From c52b82211de909e43e446e014a22eed577f306f1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:35:58 -0700 Subject: [PATCH 761/877] add note --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 260e126..8ad223d 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -221,6 +221,7 @@ def validate_values_by_column(dataset, target_seq: str): pro=dataset.loc[i, hgvs_pro_column], row=i) + # check that primary column, whether hgvs_nt or hgvs_pro, does not contain None values # make sure target seq is the right type # no protein target with just nt variants From 5ecaf4efc5d49f3bb45924c14a1405d28e35136f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:36:34 -0700 Subject: [PATCH 762/877] edit imports --- tests/validation/dataframe.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index fd0de67..73b2662 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -1,7 +1,22 @@ from unittest import TestCase import pandas as pd -from mavecore.validation.dataframe import * +from mavecore.validation.exceptions import ValidationError + +from mavecore.validation.constants.general import ( + hgvs_nt_column, + hgvs_pro_column, + 
hgvs_splice_column, + required_score_column +) + +from mavecore.validation.dataframe import ( + validate_no_null_columns_or_rows, + validate_column_names, + validate_values_by_column, + validate_score, + validate_dataframes_define_same_variants +) from mavecore.validation.constants.general import null_values_list """ From 724c1e24dbc2b21cc8a975f64b0c2502c598f13c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:37:14 -0700 Subject: [PATCH 763/877] reimplement column name validation methods --- tests/validation/dataframe.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 73b2662..6009a3d 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -75,57 +75,62 @@ def setUp(self): ) def test_valid_scores_column_names(self): - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_valid_counts_column_names(self): self.dataframe = self.dataframe.drop([required_score_column], axis=1) self.dataframe["count"] = [5] - validate_column_names(self.dataframe.columns, scores=False) + validate_column_names(self.dataframe, scores=False) def test_valid_just_hgvs_nt_hgvs_column(self): self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_valid_just_hgvs_pro_hgvs_column(self): self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) + + def test_primary_column_is_pro_when_nt_is_not_defined(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) + self.dataframe.insert(0, hgvs_splice_column, ["c.1A>G"], True) + self.dataframe = validate_column_names(self.dataframe) + 
self.assertTrue(self.dataframe.columns[0] == hgvs_pro_column) def test_missing_hgvs_column(self): self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_pro_column, hgvs_splice_column], axis=1) with self.assertRaises(ValidationError): - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_hgvs_in_wrong_location(self): self.dataframe = self.dataframe[[hgvs_nt_column, required_score_column, hgvs_pro_column, hgvs_splice_column]] - with self.assertRaises(ValidationError): - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) # validation fixes problem, should pass def test_no_additional_columns_beyond_hgvs_scores_df(self): self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) with self.assertRaises(ValidationError): - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_no_additional_columns_beyond_hgvs_counts_df(self): self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) with self.assertRaises(ValidationError): - validate_column_names(self.dataframe.columns, scores=False) + validate_column_names(self.dataframe, scores=False) def test_hgvs_columns_must_be_lowercase(self): self.dataframe.rename(columns={hgvs_nt_column: hgvs_nt_column.upper()}, inplace=True) with self.assertRaises(ValueError): - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_null_column_name(self): for value in null_values_list: self.dataframe.rename(columns={hgvs_splice_column: value}, inplace=True) with self.assertRaises(ValidationError): - validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_no_score_column_with_scores_df(self): self.dataframe = self.dataframe.drop([required_score_column], axis=1) self.dataframe["count"] = [1] with self.assertRaises(ValidationError): - 
validate_column_names(self.dataframe.columns) + validate_column_names(self.dataframe) def test_no_additional_column_with_counts_df(self): self.dataframe = self.dataframe.drop([required_score_column], axis=1) From cf95726604b13858165a787f53ce56daedb315f1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:37:22 -0700 Subject: [PATCH 764/877] edit imports --- tests/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 6009a3d..6a08a56 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -1,4 +1,5 @@ from unittest import TestCase +import numpy as np import pandas as pd from mavecore.validation.exceptions import ValidationError From 70a01f7e0334a51e38ee8f6a72cb0581d5922cc8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:37:39 -0700 Subject: [PATCH 765/877] add addtional test cases for column name validation --- tests/validation/dataframe.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 6a08a56..3892fcf 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -136,7 +136,28 @@ def test_no_score_column_with_scores_df(self): def test_no_additional_column_with_counts_df(self): self.dataframe = self.dataframe.drop([required_score_column], axis=1) with self.assertRaises(ValidationError): - validate_column_names(self.dataframe.columns, scores=False) + validate_column_names(self.dataframe, scores=False) + + def test_invalid_missing_either_required_hgvs_column(self): + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_nt_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe, scores=False) + + def test_sort_column_names(self): + self.dataframe = pd.DataFrame( + 
{ + "other": 5, + required_score_column: [1.000], + hgvs_splice_column: ["c.1A>G"], + hgvs_pro_column: ["p.Leu5Glu"], + hgvs_nt_column: ["c.1A>G"], + } + ) + dataset = validate_column_names(self.dataframe) + self.assertTrue(dataset.columns[0] == hgvs_nt_column) + self.assertTrue(dataset.columns[1] == hgvs_pro_column) + self.assertTrue(dataset.columns[2] == hgvs_splice_column) + self.assertTrue(dataset.columns[3] == required_score_column) class TestValidateVariants(TestCase): From 9b0261de810f0eb3fe8f5a66919f318ca3b138a2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:38:14 -0700 Subject: [PATCH 766/877] add data validation test methods --- tests/validation/dataframe.py | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 3892fcf..789590a 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -193,39 +193,39 @@ def test_non_numeric_values_in_score_column(self): self.dataframe[required_score_column][0] = "not a float" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) - '''data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - "I am not a number", - ) - - with self.assertRaises(ValueError): - MaveDataset.for_scores(StringIO(data))''' - #pass def test_invalid_row_hgvs_is_not_a_string(self): - '''data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - pass + self.dataframe[hgvs_nt_column][0] = 1.0 + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_empty_no_variants_parsed(self): - '''data = 
"{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL) + self.dataframe = self.dataframe.drop(axis='rows', index=0) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() + def test_invalid_hgvs_in_column(self): + # invalid hgvs_nt + nt_test = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) + nt_test[hgvs_nt_column][0] = "p.Thr1Ala" + with self.assertRaises(ValidationError): + validate_values_by_column(nt_test, target_seq=self.target_seq) + # invalid hgvs_pro + pro_test = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) + pro_test[hgvs_pro_column][0] = "c.1A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(pro_test, target_seq=self.target_seq) + # invalid hgvs_splice + splice_test = self.dataframe.drop([hgvs_pro_column], axis=1) + splice_test[hgvs_splice_column][0] = "g.1A>G" + splice_test[hgvs_splice_column][0] = "g.1A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(splice_test, target_seq=self.target_seq) - self.assertTrue(dataset.is_empty) - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - pass + def test_invalid_variants_do_not_represent_same_change(self): + self.dataframe[hgvs_nt_column][0] = "c.3A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_same_hgvs_nt_defined_in_two_rows(self): '''hgvs = generate_hgvs(prefix="c") From cf41612968c7a64d0cdcc29b723a7e7a40eb0c09 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:38:47 -0700 Subject: [PATCH 767/877] outline additional data validation test methods --- tests/validation/dataframe.py | 58 +++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/validation/dataframe.py 
b/tests/validation/dataframe.py index 789590a..f86b514 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -241,6 +241,64 @@ def test_invalid_same_hgvs_nt_defined_in_two_rows(self): print(dataset.errors)''' pass + def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): + '''hgvs = generate_hgvs(prefix="p") + data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) + + dataset = MaveDataset.for_counts(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_does_not_allow_wt_and_sy(self): + self.dataframe[hgvs_nt_column][0] = "_wt" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.dataframe[hgvs_nt_column][0] = "c.1A>G" + self.dataframe[hgvs_pro_column][0] = "_sy" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_error_missing_value_in_nt_column_when_nt_is_primary(self): + '''for v in constants.null_values_list: + with self.subTest(msg=v): + data = ( + "{},{},{}\n" + "{},{},1.0\n" + "{},{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + generate_hgvs(prefix="p"), + v, + generate_hgvs(prefix="p"), + ) + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + print(dataset.errors)''' + + def test_error_missing_value_in_pro_column_when_pro_is_primary(self): + '''for v in constants.null_values_list: + with self.subTest(msg=v): + data = "{},{}\n{},1.0\n{},1.0".format( + self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertFalse(dataset.is_valid) + self.assertEqual(len(dataset.errors), 1) + 
print(dataset.errors)''' + class TestValidateScore(TestCase): def test_valid_score(self): From 6d4395ef835cff0e77a4d2bd3948c0aa043945c0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:39:25 -0700 Subject: [PATCH 768/877] delete redundant test methods --- tests/validation/dataframe.py | 245 ---------------------------------- 1 file changed, 245 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index f86b514..83c460a 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -399,32 +399,7 @@ def mock_return_value(data, index=None): - def test_invalid_missing_either_required_hgvs_column(self): - '''data = "{},{}\n{},{}".format( - self.HGVS_SPLICE_COL, self.SCORE_COL, generate_hgvs(prefix="c"), 1.0 - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - - - - - def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): - '''hgvs = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' def test_data_method_converts_null_values_to_None(self): '''hgvs = generate_hgvs() @@ -443,57 +418,9 @@ def test_data_method_converts_null_values_to_None(self): self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) self.assertIsNone(df[self.SCORE_COL].values[0])''' - def test_sorts_header(self): - '''hgvs_nt = generate_hgvs(prefix="g") - hgvs_pro = generate_hgvs(prefix="p") - hgvs_splice = generate_hgvs(prefix="c") - data = "{},{},{},{},{}\n{},{},{},{},{}".format( - self.HGVS_PRO_COL, - self.HGVS_NT_COL, - "colA", - self.SCORE_COL, - self.HGVS_SPLICE_COL, - hgvs_pro, - hgvs_nt, - "hello", - 1.0, - 
hgvs_splice, - ) - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertListEqual( - dataset.columns, - [ - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - "colA", - ], - )''' - - def test_does_not_allow_wt_and_sy(self): - '''wt = "_wt" - sy = "_sy" - data = "{},{},{},{}\n{},{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - wt, - wt, - sy, - ) - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 3) - print(dataset.errors)''' def test_parses_numeric_column_values_into_float(self): '''hgvs = generate_hgvs(prefix="c") @@ -524,66 +451,8 @@ def test_does_not_split_double_quoted_variants(self): # with self.assertRaises(ValidationError): # _ = validate_variant_rows(BytesIO(data.encode()))''' - def test_primary_column_is_pro_when_nt_is_not_defined(self): - '''hgvs_pro = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0".format(self.HGVS_PRO_COL, self.SCORE_COL, hgvs_pro) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_PRO_COL)''' - - def test_primary_column_is_nt_by_default(self): - '''hgvs_nt = generate_hgvs(prefix="c") - hgvs_pro = generate_hgvs(prefix="p") - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL, hgvs_nt, hgvs_pro - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertEqual(dataset.index_column, self.HGVS_NT_COL)''' - - def test_error_missing_value_in_nt_column_when_nt_is_primary(self): - '''for v in constants.null_values_list: - with self.subTest(msg=v): - data = ( - "{},{},{}\n" - "{},{},1.0\n" - "{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - 
generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - v, - generate_hgvs(prefix="p"), - ) - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - def test_error_missing_value_in_pro_column_when_pro_is_primary(self): - '''for v in constants.null_values_list: - with self.subTest(msg=v): - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' def test_df_indexed_by_primary_column(self): '''data = "{},{},{}\n{},{},1.0".format( @@ -619,31 +488,7 @@ def test_invalid_duplicates_in_index(self): self.assertEqual(len(dataset.errors), 1) print(dataset.errors)''' - def test_invalid_hgvs_in_column(self): - '''tests = [ - (self.HGVS_PRO_COL, generate_hgvs(prefix="c")), - (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")), - (self.HGVS_NT_COL, generate_hgvs(prefix="p")), - ] - for (column, variant) in tests: - with self.subTest(msg=f"{column}: {variant}"): - if column == self.HGVS_SPLICE_COL: - data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - column, - self.SCORE_COL, - generate_hgvs(prefix="g"), - variant, - ) - else: - data = "{},{}\n{},1.0".format(column, self.SCORE_COL, variant) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): '''data = "{},{}\n{},1.0\n{},2.0".format( @@ -725,92 +570,6 @@ def test_invalid_close_to_zero_is_not_parsed_as_none(self): df = dataset.data() self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15)''' - def test_defines_same_variants(self): - '''tests = [ - ( - 
"{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - True, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "{},count\nc.2A>G,0.0".format(self.HGVS_NT_COL), - False, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - True, - ), - ( - "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ), - "{},{},count\nc.1A>G,p.Ile1Phe,0.0".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL - ), - False, - ), - # Check returns None if either dataset invalid - ( - "wrong_columns,{}\nc.1A>G,0.0".format(self.SCORE_COL), - "{},count\nc.1A>G,0.0".format(self.HGVS_NT_COL), - None, - ), - ( - "{},{}\nc.1A>G,0.0".format(self.HGVS_NT_COL, self.SCORE_COL), - "wrong_column,count\nc.1A>G,0.0".format(), - None, - ), - ] - - for (scores, counts, expected) in tests: - with self.subTest(msg=(scores, counts, expected)): - scores_dataset = MaveDataset.for_scores(StringIO(scores)) - scores_dataset.validate() - - counts_dataset = MaveDataset.for_counts(StringIO(counts)) - counts_dataset.validate() - - self.assertEqual(scores_dataset.match_other(counts_dataset), expected)''' - - def test_to_dict(self): - '''hgvs_1 = generate_hgvs(prefix="c") - hgvs_2 = generate_hgvs(prefix="c") - data = "{},{},{},{}\n{},,,\n{},,,1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - hgvs_1, - hgvs_2, - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertDictEqual( - dataset.to_dict(), - { - hgvs_1: { - self.HGVS_NT_COL: hgvs_1, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: None, - }, - hgvs_2: { - self.HGVS_NT_COL: hgvs_2, - self.HGVS_SPLICE_COL: None, - self.HGVS_PRO_COL: None, - self.SCORE_COL: 
1.0, - }, - }, - )''' def test_valid_targetseq_validation_fails(self): '''data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( @@ -849,7 +608,3 @@ def test_invalid_target_sequence_not_a_multiple_of_3(self): self.assertEqual(dataset.n_errors, 1) self.assertIn("multiple of 3", dataset.errors[0])''' - - #@unittest.expectedFailure - def test_invalid_relaxed_ordering_check_fails(self): - '''self.fail("Test is pending")''' From 4e5780efe8d5c728fbc109d8547becef39a71a23 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:40:08 -0700 Subject: [PATCH 769/877] reimplement validation in helper function --- mavecore/validation/utilities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index b1f20e2..0b01aa0 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -77,7 +77,8 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional else: hgvs = "p." 
+ wt + str(position) + mutant # validate variant - validate_hgvs_string(value=hgvs, column="p", targetseq=target_seq) + Variant(hgvs) + #var.validate_hgvs_string(value=hgvs, column="p", targetseq=target_seq) return hgvs From 20fca2905a6cbc58cf16e26e90b3c5b31d8cf809 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 4 Oct 2022 14:40:38 -0700 Subject: [PATCH 770/877] update __all__ variable --- mavecore/validation/variant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/variant.py b/mavecore/validation/variant.py index 2d785d0..adf785d 100644 --- a/mavecore/validation/variant.py +++ b/mavecore/validation/variant.py @@ -10,7 +10,7 @@ hgvs_pro_column, ) -__all__ = ["validate_hgvs_string", "validate_nt_variant", "validate_pro_variant", "validate_splice_variant"] +__all__ = ["validate_hgvs_string"] from mavecore.validation.utilities import is_null From 59136bcb7b480ced6c0c95076d5656902ad321b3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:29:42 -0700 Subject: [PATCH 771/877] edit imports --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 8ad223d..5db226f 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -1,6 +1,7 @@ from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal from mavehgvs.variant import Variant +import numpy as np from mavecore.validation.constants.general import ( readable_null_values_list, From a4f81f15827f9009b6a48606476daf41620c82b0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:30:10 -0700 Subject: [PATCH 772/877] make sure splice is not defined when nt is not defined --- mavecore/validation/dataframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 5db226f..04044e3 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -121,6 +121,10 @@ def validate_column_names(dataframe, scores=True): if not hgvs_nt and not hgvs_pro: raise ValidationError("Must include hgvs_nt or hgvs_pro column.") # or hgvs_splice column.") + # splice should not be defined in nt is not + if hgvs_splice and not hgvs_nt: + raise ValidationError("Must define hgvs_nt column if defining hgvs_splice column.") + # first columns should be hgvs columns, reorder columns to meet this requirement if score_column: score = dataframe.pop(required_score_column) From 5caa854301f681e65bc53c072ab8ab9f08773d51 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:31:02 -0700 Subject: [PATCH 773/877] make sure hgvs_pro or hgvs_nt column is present --- mavecore/validation/dataframe.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 04044e3..a0e5632 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -204,6 +204,17 @@ def validate_values_by_column(dataset, target_seq: str): else: raise ValidationError("Missing required hgvs and/or score columns.") + # check that the first column, hgvs_nt or hgvs_pro, is valid + if hgvs_nt: + validate_index_column(dataset["hgvs_nt"], hgvs="nt") + elif hgvs_pro: + validate_index_column(dataset["hgvs_pro"], hgvs="pro") + else: + raise ValidationError("Must include either hgvs_nt or hgvs_pro column.") + + # check that prefixes all match and are consistent with one another + hgvs_nt_prefix = None + # loop through row by row, validate hgvs strings, make sure nt and pro are consistent with one another for i in range(len(dataset)): if hgvs_nt: From 5ba27382fae1e9df064d44b3c97ad432ff371f34 Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:32:02 -0700 Subject: [PATCH 774/877] check that prefixes are consistent and edit variant validation arguments --- mavecore/validation/dataframe.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index a0e5632..4f1c286 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -220,15 +220,25 @@ def validate_values_by_column(dataset, target_seq: str): if hgvs_nt: validate_hgvs_string(value=dataset.loc[i, hgvs_nt_column], column="nt", - targetseq=target_seq) + targetseq=target_seq, + splice_present=hgvs_splice) + if hgvs_nt_prefix: + if Variant(dataset.loc[i, hgvs_nt_column]).prefix != hgvs_nt_prefix: + raise ValidationError("All prefixes within the hgvs_nt column must be the same.") + else: # assign the prefix value since it has not yet been assigned + hgvs_nt_prefix = Variant(dataset.loc[i, hgvs_nt_column]).prefix if hgvs_pro: validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], column="p", - targetseq=target_seq) + targetseq=target_seq, + splice_present=hgvs_splice) if hgvs_splice: validate_hgvs_string(value=dataset.loc[i, hgvs_splice_column], column="splice", - targetseq=target_seq) + targetseq=target_seq, + splice_present=hgvs_splice) + if hgvs_nt_prefix != 'g': + raise ValidationError("hgvs_nt prefix must be genomic when splice present.") if score: validate_score(dataset.loc[i, required_score_column]) if hgvs_nt and not Variant(hgvs_pro).is_multi_variant(): # can only convert to single hgvs_pro variants From 84f81a5745d32def1e9bbbe39cdc0724ce9be8b7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:32:25 -0700 Subject: [PATCH 775/877] edit score validation and variant validation arguments --- mavecore/validation/dataframe.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff 
--git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 4f1c286..2325fea 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -240,12 +240,14 @@ def validate_values_by_column(dataset, target_seq: str): if hgvs_nt_prefix != 'g': raise ValidationError("hgvs_nt prefix must be genomic when splice present.") if score: - validate_score(dataset.loc[i, required_score_column]) - if hgvs_nt and not Variant(hgvs_pro).is_multi_variant(): # can only convert to single hgvs_pro variants - validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, - nt=dataset.loc[i, hgvs_nt_column], - pro=dataset.loc[i, hgvs_pro_column], - row=i) + s = validate_score(dataset.loc[i, required_score_column]) + dataset.loc[i, required_score_column] = s + if hgvs_nt and hgvs_pro: + if not Variant(dataset.loc[i, hgvs_pro_column]).is_multi_variant(): # can only convert to single hgvs_pro variants + validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, + nt=dataset.loc[i, hgvs_nt_column], + pro=dataset.loc[i, hgvs_pro_column], + row=i) # check that primary column, whether hgvs_nt or hgvs_pro, does not contain None values # make sure target seq is the right type From 4271d6b86ff2d0164adc44398db98f556456200b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:33:01 -0700 Subject: [PATCH 776/877] define and implement index column validation --- mavecore/validation/dataframe.py | 33 +++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 2325fea..4a8c831 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -254,7 +254,38 @@ def validate_values_by_column(dataset, target_seq: str): # no protein target with just nt variants -def validate_score(score: float): +def validate_index_column(column, hgvs: str): + """ + 
Validates the first column in a dataframe, should be hgvs_nt or hgvs_pro. All values in the column should be + unique and there should be no missing values. + + Parameters + __________ + column : list + The column that will be validated. + hgvs : str + Indicates whether or not the column is an hgvs_nt or hgvs_pro column. Can have value "nt" or "pro". + + Raises + ______ + ValidationError + If there are duplicate values in the column. + ValidationError + If there are missing values in the column. + """ + col_set = set(column) + if len(col_set) != len(column): + raise ValidationError( + "Each value in hgvs_'{}' column must be unique.".format(hgvs) + ) + if np.nan in col_set: + print("lasjdfljsadl;jflsjf;sjdlfj") + raise ValidationError( + "Primary column (hgvs_'{}') must not contain missing values.".format(hgvs) + ) + + +def validate_score(score): # TODO we probably dont need this if type(score) != float: raise ValidationError( From e2648ed6c55d2164a5df4549dc27beedc713b16a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:33:33 -0700 Subject: [PATCH 777/877] reimplement score validation to cast values to float if allowed, else raise error --- mavecore/validation/dataframe.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 4a8c831..f5524c7 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -287,11 +287,12 @@ def validate_index_column(column, hgvs: str): def validate_score(score): # TODO we probably dont need this - if type(score) != float: - raise ValidationError( - "Each value in score column must by a float. " - "'{}' has the type '{}'.".format(score, type(score).__name__) - ) + try: + score = float(score) + except ValueError: + raise ValidationError("Each value in score column must by a float. 
" + "'{}' has the type '{}'.".format(score, type(score).__name__)) + return score def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str, pro: str, row: int): From fb5f719467e5707cfd4d93bf3bbe352f29222ff8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:33:53 -0700 Subject: [PATCH 778/877] edit imports --- tests/validation/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 83c460a..fc6ea69 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -16,7 +16,8 @@ validate_column_names, validate_values_by_column, validate_score, - validate_dataframes_define_same_variants + validate_dataframes_define_same_variants, + validate_index_column ) from mavecore.validation.constants.general import null_values_list From 6b35cbc725eddc32492a052b5462c83075763bd7 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:34:11 -0700 Subject: [PATCH 779/877] delete redundant imports, add comment --- tests/validation/dataframe.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index fc6ea69..37ea7ad 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -20,25 +20,7 @@ validate_index_column ) from mavecore.validation.constants.general import null_values_list - -""" -from io import BytesIO, StringIO - -from mavecore.validation import constants - -from mavecore.validation.dataset_validators import ( - validate_scoreset_count_data_input, - validate_scoreset_score_data_input, - validate_at_least_one_additional_column, - validate_has_hgvs_in_header, - validate_header_contains_no_null_columns, - read_header_from_io, - validate_scoreset_json, - validate_datasets_define_same_variants, -) - - -""" +# let pandas 
handle the types of null values to allow class TestValidateNoNullColumnsOrRows(TestCase): From d3ba718db25d54763297d82d69d2ac45dc33e074 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:34:43 -0700 Subject: [PATCH 780/877] change None to np.nan in test cases --- tests/validation/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 37ea7ad..9d646c4 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -37,12 +37,12 @@ def test_valid(self): validate_no_null_columns_or_rows(self.dataframe) def test_null_row(self): - self.dataframe.loc[1] = [None, None, None] + self.dataframe.loc[1] = [np.nan, np.nan, np.nan] with self.assertRaises(AssertionError): validate_no_null_columns_or_rows(self.dataframe) def test_null_column(self): - self.dataframe[hgvs_pro_column][0] = None + self.dataframe[hgvs_pro_column][0] = np.nan with self.assertRaises(AssertionError): validate_no_null_columns_or_rows(self.dataframe) From 3f7a21b7e4c9f911da77dd3a9d60b16955c169e0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:35:06 -0700 Subject: [PATCH 781/877] add test case for column names --- tests/validation/dataframe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 9d646c4..e06d17c 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -126,6 +126,11 @@ def test_invalid_missing_either_required_hgvs_column(self): with self.assertRaises(ValidationError): validate_column_names(self.dataframe, scores=False) + def test_invalid_splice_column_defined_when_nt_column_is_not(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe, scores=False) + def 
test_sort_column_names(self): self.dataframe = pd.DataFrame( { From 19a3e4f2795064ea0ad7e8955e98dc1c1875151b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:35:38 -0700 Subject: [PATCH 782/877] edit setUp method to validate values by column --- tests/validation/dataframe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index e06d17c..e1223fd 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -167,13 +167,13 @@ def test_invalid_variants(self): class TestValidateValuesByColumn(TestCase): def setUp(self): - self.target_seq = "ACA" + self.target_seq = "ATGACA" self.dataframe = pd.DataFrame( { - hgvs_nt_column: ["c.1A>G"], - hgvs_pro_column: ["p.Thr1Ala"], - hgvs_splice_column: ["c.1A>G"], - required_score_column: [1.000], + hgvs_nt_column: ["g.4A>G", "g.5C>G", "g.6A>G"], + hgvs_pro_column: ["p.Thr2Ala", "p.Thr2Arg", "p.Thr2="], + hgvs_splice_column: ["c.4A>G", "c.5C>G", "c.6A>G"], + required_score_column: [1.000, 0.5, 1.5], } ) From 2ee0ff6cfa73be1ebf58f32563beed7319ccd73c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:36:06 -0700 Subject: [PATCH 783/877] add test case to test for valid dataframe when validating values by column --- tests/validation/dataframe.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index e1223fd..d19b6ba 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -177,6 +177,9 @@ def setUp(self): } ) + def test_valid(self): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + def test_non_numeric_values_in_score_column(self): self.dataframe[required_score_column][0] = "not a float" with self.assertRaises(ValidationError): From acf48605d45541d4f0f29e9aa3e385bb901775b6 Mon Sep 17 00:00:00 2001 
From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:36:25 -0700 Subject: [PATCH 784/877] drop extra rows to reflect changes in setUp method --- tests/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index d19b6ba..7fcb690 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -191,7 +191,7 @@ def test_invalid_row_hgvs_is_not_a_string(self): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_empty_no_variants_parsed(self): - self.dataframe = self.dataframe.drop(axis='rows', index=0) + self.dataframe = self.dataframe.drop(axis='rows', index=[0, 1, 2]) with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) From 70b65b97e4256a5489e69dfcb5477124b45cb75f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:36:47 -0700 Subject: [PATCH 785/877] refactor test case into three separate test cases --- tests/validation/dataframe.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 7fcb690..1275800 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -195,18 +195,19 @@ def test_empty_no_variants_parsed(self): with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) - def test_invalid_hgvs_in_column(self): - # invalid hgvs_nt + def test_invalid_hgvs_nt_in_column(self): nt_test = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) nt_test[hgvs_nt_column][0] = "p.Thr1Ala" with self.assertRaises(ValidationError): validate_values_by_column(nt_test, target_seq=self.target_seq) - # invalid hgvs_pro + + def test_invalid_hgvs_pro_in_column(self): pro_test = self.dataframe.drop([hgvs_nt_column, 
hgvs_splice_column], axis=1) pro_test[hgvs_pro_column][0] = "c.1A>G" with self.assertRaises(ValidationError): validate_values_by_column(pro_test, target_seq=self.target_seq) - # invalid hgvs_splice + + def test_invalid_hgvs_splice_in_column(self): splice_test = self.dataframe.drop([hgvs_pro_column], axis=1) splice_test[hgvs_splice_column][0] = "g.1A>G" splice_test[hgvs_splice_column][0] = "g.1A>G" From a624abf7ca50c076af90bf3afc42a2cca5a2b2f1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:37:00 -0700 Subject: [PATCH 786/877] refactor test case into two separate test cases --- tests/validation/dataframe.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 1275800..e46fb7a 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -219,35 +219,12 @@ def test_invalid_variants_do_not_represent_same_change(self): with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) - def test_invalid_same_hgvs_nt_defined_in_two_rows(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - pass - - def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self): - '''hgvs = generate_hgvs(prefix="p") - data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs, hgvs) - - dataset = MaveDataset.for_counts(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - def test_does_not_allow_wt_and_sy(self): + def test_does_not_allow_wt(self): self.dataframe[hgvs_nt_column][0] 
= "_wt" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_does_not_allow_sy(self): self.dataframe[hgvs_nt_column][0] = "c.1A>G" self.dataframe[hgvs_pro_column][0] = "_sy" with self.assertRaises(ValidationError): From 81d71bf228e9c741b2f8260fa6681c4998bb8853 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:37:34 -0700 Subject: [PATCH 787/877] implement suite of new test cases --- tests/validation/dataframe.py | 130 ++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index e46fb7a..ebfbbf0 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -230,6 +230,136 @@ def test_does_not_allow_sy(self): with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) + # TODO this should be handled by pandas + def test_data_method_converts_null_values_to_None(self): + '''hgvs = generate_hgvs() + for value in constants.null_values_list: + with self.subTest(msg=value): + data = "{},{}\n{},{}".format( + self.HGVS_NT_COL, self.SCORE_COL, hgvs, value + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + + df = dataset.data(serializable=True) + self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) + self.assertIsNone(df[self.SCORE_COL].values[0])''' + + # TODO not sure if we want to do this + def test_parses_numeric_column_values_into_float(self): + self.dataframe[required_score_column][0] = "1.1" + self.assertTrue(type(self.dataframe[required_score_column][0]) == str) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.assertTrue(type(self.dataframe[required_score_column][0]) == float) + self.dataframe[required_score_column][0] = 1 + self.assertTrue(type(self.dataframe[required_score_column][0]) == 
int) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.assertTrue(type(self.dataframe[required_score_column][0]) == float) + + def test_does_not_split_double_quoted_variants(self): + '''hgvs = "c.[123A>G;124A>G]" + data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + self.assertIn(hgvs, dataset.data()[self.HGVS_NT_COL]) + + # def test_invalid_non_double_quoted_multi_variant_row(self): + # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) + # data = "{},{}\n'{}',1.0".format( + # constants.hgvs_nt_column, required_score_column, hgvs + # ) + # with self.assertRaises(ValidationError): + # _ = validate_variant_rows(BytesIO(data.encode()))''' + + # TODO may not need to include this + def test_df_indexed_by_primary_column(self): + '''data = "{},{},{}\n{},{},1.0".format( + self.HGVS_NT_COL, + self.HGVS_PRO_COL, + self.SCORE_COL, + generate_hgvs(prefix="c"), + generate_hgvs(prefix="p"), + ) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + assert_index_equal(dataset.data().index, dataset.index)''' + + def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): + self.dataframe[hgvs_nt_column][0] = "c.4A>G" + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_nt_not_genomic_when_splice_present(self): + self.dataframe[hgvs_nt_column][0] = "c.4A>G" + self.dataframe[hgvs_nt_column][1] = "c.5C>G" + self.dataframe[hgvs_nt_column][2] = "c.6A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_splice_not_defined_when_nt_is_genomic(self): + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + with 
self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_invalid_zero_is_not_parsed_as_none(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + df = dataset.data() + self.assertEqual(df[self.SCORE_COL].values[0], 0)''' + + def test_invalid_close_to_zero_is_not_parsed_as_none(self): + '''hgvs = generate_hgvs(prefix="c") + data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) + + dataset = MaveDataset.for_scores(StringIO(data)) + dataset.validate() + + self.assertTrue(dataset.is_valid) + df = dataset.data() + self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15)''' + + +class TestValidateIndexColumn(TestCase): + def setUp(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["c.1A>G", "c.2C>G", "c.3A>G"], + hgvs_pro_column: ["p.Thr1Ala", "p.Thr1Arg", "p.="], + required_score_column: [1.0, 0.5, 1.5], + } + ) + + def test_valid(self): + validate_index_column(self.dataframe["hgvs_nt"], "nt") + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + validate_index_column(self.dataframe["hgvs_pro"], "pro") + + def test_invalid_same_hgvs_nt_defined_in_two_rows(self): + self.dataframe[hgvs_nt_column][0] = "c.2C>G" + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_nt"], "nt") + + def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro_when_pro_is_primary_column(self): + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + self.dataframe[hgvs_pro_column][0] = "p.Thr1Arg" + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_pro"], "pro") + def test_error_missing_value_in_nt_column_when_nt_is_primary(self): '''for v in constants.null_values_list: with self.subTest(msg=v): From 
c5f7154ac502a318ca5a606a6759b60992a25293 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:38:20 -0700 Subject: [PATCH 788/877] implement test cases to check for missing values in primary columns --- tests/validation/dataframe.py | 42 +++++++---------------------------- 1 file changed, 8 insertions(+), 34 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index ebfbbf0..20b5939 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -361,42 +361,16 @@ def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro_when_pro_is_primar validate_index_column(self.dataframe["hgvs_pro"], "pro") def test_error_missing_value_in_nt_column_when_nt_is_primary(self): - '''for v in constants.null_values_list: - with self.subTest(msg=v): - data = ( - "{},{},{}\n" - "{},{},1.0\n" - "{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - v, - generate_hgvs(prefix="p"), - ) - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' + self.dataframe[hgvs_nt_column][0] = np.nan + print(self.dataframe) + with self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_nt"], "nt") def test_error_missing_value_in_pro_column_when_pro_is_primary(self): - '''for v in constants.null_values_list: - with self.subTest(msg=v): - data = "{},{}\n{},1.0\n{},1.0".format( - self.HGVS_PRO_COL, self.SCORE_COL, generate_hgvs(prefix="p"), v - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' + self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) + self.dataframe[hgvs_pro_column][0] = np.nan + with 
self.assertRaises(ValidationError): + validate_index_column(self.dataframe["hgvs_pro"], "pro") class TestValidateScore(TestCase): From 2c0fc442c884afdbf1bd733c5f4b9bef06b460f9 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:38:41 -0700 Subject: [PATCH 789/877] delete redundant test cases --- tests/validation/dataframe.py | 197 ---------------------------------- 1 file changed, 197 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 20b5939..efcab4e 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -430,203 +430,6 @@ def test_counts_defines_different_pro_variants(self): with self.assertRaises(ValidationError): validate_dataframes_define_same_variants(self.scores, self.counts) -""" -from io import StringIO - -import pandas as pd -from pandas.testing import assert_index_equal - -# from dataset import constants -from mavecore.validation import constants - -from mavecore.validation.variant_validators import ( - MaveDataset, -) -""" - - -class TestMaveDataset(TestCase): - """ - Tests the validator :func:`validate_variant_rows` to check if the correct - errors are thrown when invalid rows are encountered in a - scores/counts/meta data input file. Checks for: - - Invalid HGVS string in a row - - Row HGVS is defined in more than one row - - Row values are not int/float for a count/score file - - Tests also check to see if the correct header and hgvs data information - is parsed and returned. 
- """ - - '''SCORE_COL = constants.required_score_column - HGVS_NT_COL = constants.hgvs_nt_column - HGVS_SPLICE_COL = constants.hgvs_splice_column - HGVS_PRO_COL = constants.hgvs_pro_column''' - - @staticmethod - def mock_return_value(data, index=None): - '''df = pd.read_csv(StringIO(data), sep=",", na_values=["None", None]) - if index: - df.index = pd.Index(df[index]) - return df''' - - - - - - def test_data_method_converts_null_values_to_None(self): - '''hgvs = generate_hgvs() - for value in constants.null_values_list: - with self.subTest(msg=value): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, value - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - - df = dataset.data(serializable=True) - self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) - self.assertIsNone(df[self.SCORE_COL].values[0])''' - - - - - - def test_parses_numeric_column_values_into_float(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - value = dataset.data()[self.SCORE_COL].values[0] - self.assertIsInstance(value, float)''' - - def test_does_not_split_double_quoted_variants(self): - '''hgvs = "c.[123A>G;124A>G]" - data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - self.assertIn(hgvs, dataset.data()[self.HGVS_NT_COL]) - - # def test_invalid_non_double_quoted_multi_variant_row(self): - # hgvs = "{},{}".format(generate_hgvs(), generate_hgvs()) - # data = "{},{}\n'{}',1.0".format( - # constants.hgvs_nt_column, required_score_column, hgvs - # ) - # with self.assertRaises(ValidationError): - # _ = validate_variant_rows(BytesIO(data.encode()))''' - - - - - def 
test_df_indexed_by_primary_column(self): - '''data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - assert_index_equal(dataset.data().index, dataset.index)''' - - def test_invalid_duplicates_in_index(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{},{}\n{},{},1.0\n{},{},2.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - hgvs, - generate_hgvs(prefix="p"), - hgvs, - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - - - def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): - '''data = "{},{}\n{},1.0\n{},2.0".format( - self.HGVS_NT_COL, - self.SCORE_COL, - generate_hgvs(prefix="g"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors)''' - - def test_invalid_nt_not_genomic_when_splice_present(self): - '''data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - print(dataset.errors)''' - - def test_invalid_splice_defined_when_nt_is_not(self): - '''data = "{},{},{}\n,{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_SPLICE_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 1) - 
print(dataset.errors)''' - - def test_invalid_splice_not_defined_when_nt_is_genomic(self): - '''data = "{},{}\n{},1.0".format( - self.HGVS_NT_COL, self.SCORE_COL, generate_hgvs(prefix="g") - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertFalse(dataset.is_valid) - self.assertEqual(len(dataset.errors), 2) - print(dataset.errors)''' - - def test_invalid_zero_is_not_parsed_as_none(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() self.assertTrue(dataset.is_valid) df = dataset.data() From 3c714ea1227cd2958d692dbf928817176bcd5d0e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 11 Oct 2022 17:39:13 -0700 Subject: [PATCH 790/877] add test class to validate target seq with regards to variants --- tests/validation/dataframe.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index efcab4e..7bdb455 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -431,21 +431,7 @@ def test_counts_defines_different_pro_variants(self): validate_dataframes_define_same_variants(self.scores, self.counts) - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 0)''' - - def test_invalid_close_to_zero_is_not_parsed_as_none(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15)''' - +class TestTargetSeqIsValidWithRegardsToVariants(TestCase): def test_valid_targetseq_validation_fails(self): '''data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( From 
91e03b8c0cb864c17b2b849e804e5259181f7ee3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:09:55 -0700 Subject: [PATCH 791/877] update test method to reflect changes in validation --- tests/validation/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 7bdb455..716ad20 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -75,8 +75,8 @@ def test_valid_just_hgvs_pro_hgvs_column(self): validate_column_names(self.dataframe) def test_primary_column_is_pro_when_nt_is_not_defined(self): - self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) - self.dataframe.insert(0, hgvs_splice_column, ["c.1A>G"], True) + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column, required_score_column], axis=1) + self.dataframe.insert(0, required_score_column, [1.000], True) self.dataframe = validate_column_names(self.dataframe) self.assertTrue(self.dataframe.columns[0] == hgvs_pro_column) From f8ede3e0acea6111dc03c21b5fe6378e9ca3643c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:10:00 -0700 Subject: [PATCH 792/877] update test method to reflect changes in validation --- tests/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 716ad20..b643a77 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -138,7 +138,7 @@ def test_sort_column_names(self): required_score_column: [1.000], hgvs_splice_column: ["c.1A>G"], hgvs_pro_column: ["p.Leu5Glu"], - hgvs_nt_column: ["c.1A>G"], + hgvs_nt_column: ["g.1A>G"], } ) dataset = validate_column_names(self.dataframe) From cd054d308067b295be58156c5560b1301686f6fe Mon Sep 17 00:00:00 2001 From: harmatt 
<79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:11:20 -0700 Subject: [PATCH 793/877] refactor test methods to prevent error messages --- tests/validation/dataframe.py | 42 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index b643a77..771e06f 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -181,12 +181,12 @@ def test_valid(self): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_non_numeric_values_in_score_column(self): - self.dataframe[required_score_column][0] = "not a float" + self.dataframe.loc[0, [required_score_column]] = "not a float" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_row_hgvs_is_not_a_string(self): - self.dataframe[hgvs_nt_column][0] = 1.0 + self.dataframe.loc[0, [hgvs_nt_column]] = 1.0 with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) @@ -196,37 +196,35 @@ def test_empty_no_variants_parsed(self): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_hgvs_nt_in_column(self): - nt_test = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) - nt_test[hgvs_nt_column][0] = "p.Thr1Ala" + self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column], axis=1) + self.dataframe.loc[0, [hgvs_nt_column]] = "p.Thr1Ala" with self.assertRaises(ValidationError): - validate_values_by_column(nt_test, target_seq=self.target_seq) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_hgvs_pro_in_column(self): - pro_test = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) - pro_test[hgvs_pro_column][0] = "c.1A>G" + self.dataframe = self.dataframe.drop([hgvs_nt_column, hgvs_splice_column], axis=1) + self.dataframe.loc[0, 
[hgvs_pro_column]] = "c.1A>G" with self.assertRaises(ValidationError): - validate_values_by_column(pro_test, target_seq=self.target_seq) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_hgvs_splice_in_column(self): - splice_test = self.dataframe.drop([hgvs_pro_column], axis=1) - splice_test[hgvs_splice_column][0] = "g.1A>G" - splice_test[hgvs_splice_column][0] = "g.1A>G" + self.dataframe = self.dataframe.drop([hgvs_pro_column], axis=1) + self.dataframe.loc[0, [hgvs_splice_column]] = "g.1A>G" with self.assertRaises(ValidationError): - validate_values_by_column(splice_test, target_seq=self.target_seq) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_variants_do_not_represent_same_change(self): - self.dataframe[hgvs_nt_column][0] = "c.3A>G" + self.dataframe.loc[0, [hgvs_nt_column]] = "c.3A>G" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_does_not_allow_wt(self): - self.dataframe[hgvs_nt_column][0] = "_wt" + self.dataframe.loc[0, [hgvs_nt_column]] = "_wt" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_does_not_allow_sy(self): - self.dataframe[hgvs_nt_column][0] = "c.1A>G" - self.dataframe[hgvs_pro_column][0] = "_sy" + self.dataframe.loc[0, [hgvs_pro_column]] = "_sy" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) @@ -250,11 +248,11 @@ def test_data_method_converts_null_values_to_None(self): # TODO not sure if we want to do this def test_parses_numeric_column_values_into_float(self): - self.dataframe[required_score_column][0] = "1.1" + self.dataframe.loc[0, [required_score_column]] = "1.1" self.assertTrue(type(self.dataframe[required_score_column][0]) == str) validate_values_by_column(self.dataframe, target_seq=self.target_seq) 
self.assertTrue(type(self.dataframe[required_score_column][0]) == float) - self.dataframe[required_score_column][0] = 1 + self.dataframe.loc[0, [required_score_column]] = 1 self.assertTrue(type(self.dataframe[required_score_column][0]) == int) validate_values_by_column(self.dataframe, target_seq=self.target_seq) self.assertTrue(type(self.dataframe[required_score_column][0]) == float) @@ -294,15 +292,15 @@ def test_df_indexed_by_primary_column(self): assert_index_equal(dataset.data().index, dataset.index)''' def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): - self.dataframe[hgvs_nt_column][0] = "c.4A>G" + self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_nt_not_genomic_when_splice_present(self): - self.dataframe[hgvs_nt_column][0] = "c.4A>G" - self.dataframe[hgvs_nt_column][1] = "c.5C>G" - self.dataframe[hgvs_nt_column][2] = "c.6A>G" + self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" + self.dataframe.loc[1, [hgvs_nt_column]] = "c.5C>G" + self.dataframe.loc[2, [hgvs_nt_column]] = "c.6A>G" with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) From 5486d6370191eb720e6d1dcad88cd158c1b21301 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:11:31 -0700 Subject: [PATCH 794/877] remove print statement --- tests/validation/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 771e06f..e77e0ff 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -360,7 +360,6 @@ def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro_when_pro_is_primar def test_error_missing_value_in_nt_column_when_nt_is_primary(self): self.dataframe[hgvs_nt_column][0] = 
np.nan - print(self.dataframe) with self.assertRaises(ValidationError): validate_index_column(self.dataframe["hgvs_nt"], "nt") From 85c76cf5d459a523e5e110829931bc70040811b2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:14:46 -0700 Subject: [PATCH 795/877] remove print statement --- mavecore/validation/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index f5524c7..a046ca3 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -279,7 +279,6 @@ def validate_index_column(column, hgvs: str): "Each value in hgvs_'{}' column must be unique.".format(hgvs) ) if np.nan in col_set: - print("lasjdfljsadl;jflsjf;sjdlfj") raise ValidationError( "Primary column (hgvs_'{}') must not contain missing values.".format(hgvs) ) From 7855aa65ebd01bcd7b067628c0da512f71302807 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:15:25 -0700 Subject: [PATCH 796/877] add notes --- mavecore/validation/dataframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index a046ca3..5369ee9 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -14,6 +14,10 @@ from mavecore.validation.variant import validate_hgvs_string from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro, is_null +# handle with pandas all null strings +# provide a csv or a pandas dataframe +# take dataframe, output as csv to temp directory, use standard library + def validate_dataframes(target_seq: str, scores, counts=None): """ From e53c0345189dbcd934517ad44c81c264ce0896fa Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 12 Oct 2022 12:15:59 -0700 Subject: [PATCH 797/877] update test cases to remove errors --- 
tests/validation/dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index e77e0ff..cb078cd 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -348,24 +348,24 @@ def test_valid(self): validate_index_column(self.dataframe["hgvs_pro"], "pro") def test_invalid_same_hgvs_nt_defined_in_two_rows(self): - self.dataframe[hgvs_nt_column][0] = "c.2C>G" + self.dataframe.loc[0, [hgvs_nt_column]] = "c.2C>G" with self.assertRaises(ValidationError): validate_index_column(self.dataframe["hgvs_nt"], "nt") def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro_when_pro_is_primary_column(self): self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) - self.dataframe[hgvs_pro_column][0] = "p.Thr1Arg" + self.dataframe.loc[0, [hgvs_pro_column]] = "p.Thr1Arg" with self.assertRaises(ValidationError): validate_index_column(self.dataframe["hgvs_pro"], "pro") def test_error_missing_value_in_nt_column_when_nt_is_primary(self): - self.dataframe[hgvs_nt_column][0] = np.nan + self.dataframe.loc[0, [hgvs_nt_column]] = np.nan with self.assertRaises(ValidationError): validate_index_column(self.dataframe["hgvs_nt"], "nt") def test_error_missing_value_in_pro_column_when_pro_is_primary(self): self.dataframe = self.dataframe.drop([hgvs_nt_column], axis=1) - self.dataframe[hgvs_pro_column][0] = np.nan + self.dataframe.loc[0, [hgvs_pro_column]] = np.nan with self.assertRaises(ValidationError): validate_index_column(self.dataframe["hgvs_pro"], "pro") From de90affc64f76d06fc536e4d9457bdec307be66c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 11:49:17 -0400 Subject: [PATCH 798/877] update docstring --- mavecore/validation/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 5369ee9..53eea3c 100644 --- 
a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -309,8 +309,10 @@ def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str The target sequence associated withe variants. nt : str The hgvs_nt string. - pro : list + pro : str The hgvs_pro string. + row : int + The row that the current hgvs strings being evaluated are in. Raises ______ From c0592afc102b353f77968d0ae61c469f9e5505a2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 11:49:28 -0400 Subject: [PATCH 799/877] update inputs --- tests/validation/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index cb078cd..1486d84 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -17,7 +17,8 @@ validate_values_by_column, validate_score, validate_dataframes_define_same_variants, - validate_index_column + validate_index_column, + validate_hgvs_nt_and_hgvs_pro_represent_same_change, ) from mavecore.validation.constants.general import null_values_list # let pandas handle the types of null values to allow From 29446f271aa93fe67d00e42c199e6fe8d4bbd5ef Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 11:49:55 -0400 Subject: [PATCH 800/877] add unittest for checking mismatched variants and column names --- tests/validation/dataframe.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 1486d84..50e77c3 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -332,6 +332,18 @@ def test_invalid_close_to_zero_is_not_parsed_as_none(self): df = dataset.data() self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15)''' + def test_mismatched_variants_and_column_names(self): + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["p.Thr2Ala", 
"p.Thr2Arg", "p.Thr2="], + hgvs_pro_column: ["g.4A>G", "g.5C>G", "g.6A>G"], + hgvs_splice_column: ["c.4A>G", "c.5C>G", "c.6A>G"], + required_score_column: [1.000, 0.5, 1.5], + } + ) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + class TestValidateIndexColumn(TestCase): def setUp(self): From b6dc4039ad8156b0db55f1d4f039dae0800a677d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 11:50:17 -0400 Subject: [PATCH 801/877] delete redundant test class and methods --- tests/validation/dataframe.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 50e77c3..5191d31 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -392,14 +392,6 @@ def test_invalid_score(self): validate_score("a") -class TestVariantsMatchHgvsColumnNames(TestCase): - def test_valid(self): - pass - - def test_mismatched_variants_and_column_names(self): - pass - - class TestHgvsColumnsDefineSameVariants(TestCase): def test_valid(self): pass From dafa9245cb0c661113ec880cf2c15cbc973f7b9c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 11:50:45 -0400 Subject: [PATCH 802/877] implement test class and methods for checking that variants represent same change --- tests/validation/dataframe.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 5191d31..51010c9 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -393,8 +393,31 @@ def test_invalid_score(self): class TestHgvsColumnsDefineSameVariants(TestCase): + def setUp(self): + self.target_seq = "ATGACA" + self.dataframe = pd.DataFrame( + { + hgvs_nt_column: ["g.4A>G", "g.5C>G", "g.6A>G"], + hgvs_pro_column: ["p.Thr2Ala", "p.Thr2Arg", 
"p.Thr2="], + required_score_column: [1.000, 0.5, 1.5], + } + ) + def test_valid(self): - pass + for i in range(3): + validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=self.target_seq, + nt=self.dataframe[hgvs_nt_column][i], + pro=self.dataframe[hgvs_pro_column][i], + row=i) + + def test_invalid_nt_and_pro_do_not_represent_same_change(self): + self.dataframe.loc[0, [hgvs_nt_column]] = "g.2C>G" + with self.assertRaises(ValidationError): + for i in range(3): + validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=self.target_seq, + nt=self.dataframe[hgvs_nt_column][i], + pro=self.dataframe[hgvs_pro_column][i], + row=i) class TestDataframesDefineSameVariants(TestCase): From fcb9b1d36d338bbe1ae64ae0d8dec704ba6f47bf Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:35:57 -0400 Subject: [PATCH 803/877] outline unittest for target --- tests/validation/target.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/validation/target.py b/tests/validation/target.py index e69de29..6b04d62 100644 --- a/tests/validation/target.py +++ b/tests/validation/target.py @@ -0,0 +1,18 @@ +from unittest import TestCase + +from mavecore.validation.target import * + + +class TestValidateTargetCategory(TestCase): + def test_valid(self): + pass + + +class TestValidateSequenceCategory(TestCase): + def test_valid(self): + pass + + +class TestValidateTargetSequence(TestCase): + def test_valid(self): + pass From a7a321980dec7048d91f48040e0b71d29e8647e0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:36:04 -0400 Subject: [PATCH 804/877] remove pass --- tests/validation/urn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/validation/urn.py b/tests/validation/urn.py index 4723144..ac2b33f 100644 --- a/tests/validation/urn.py +++ b/tests/validation/urn.py @@ -13,7 +13,6 @@ def test_invalid_mavedb_urn(self): def 
test_valid_mavedb_urn_experimentset(self): validate_mavedb_urn_experimentset("") - pass def test_invalid_mavedb_urn_experimentset(self): with self.assertRaises(ValidationError): From 34a23515dc98607c1a49c0c7be3ddca52489864d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:36:54 -0400 Subject: [PATCH 805/877] add code to validate target sequence --- mavecore/validation/target.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mavecore/validation/target.py b/mavecore/validation/target.py index 3991f31..591425e 100644 --- a/mavecore/validation/target.py +++ b/mavecore/validation/target.py @@ -38,3 +38,25 @@ def validate_sequence_category(sequence_type: str): if sequence_type not in valid_sequence_types: raise ValidationError("{}'s is not a valid sequence type. Valid sequence types are " "Infer, DNA, and Protein".format(sequence_type)) + + +def validate_target_sequence(target_seq: str): + """ + Validates a target sequence. The sequence should consists of only ACTG and the length should be a multiple of 3. + + Parameters + __________ + sequence : str + The target sequence that will be validated. + + Raises + ______ + ValidationError + If the target sequence does not consist of ACTG or if the length of the sequence is not a multiple of 3. 
+ """ + # if target_seq is not made solely of characters ACTG + check_chars = [letter in "ACTG" for letter in target_seq] + if False in check_chars: + raise ValidationError("target_seq is invalid, must be composed only of bases ACTG.") + if len(target_seq) % 3 != 0: + raise ValidationError("target_seq is invalid, length must be a multiple of three.") \ No newline at end of file From cbceaca47c048e8dd1799ed278708406a260256f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:37:35 -0400 Subject: [PATCH 806/877] update setUp methods to reflect changes in validation --- tests/models/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/data.py b/tests/models/data.py index 570b14d..e361be0 100644 --- a/tests/models/data.py +++ b/tests/models/data.py @@ -59,8 +59,8 @@ def setUp(self): doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} pubmed_identifier = {"identifier": "29785012"} reference_map = {"genomeId": 0, "targetId": 0} - sequence = {"sequenceType": "DNA", "sequence": "ATCG"} - external_identifier_id = {"dbname": "uniprot", "identifier": "P01133"} + sequence = {"sequenceType": "DNA", "sequence": "ATC"} + external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} external_identifier = {"identifier": external_identifier_id, "offset": 0} target = {"name": "name", "category": "Protein coding", From b1fc10ab81a653a5740a70f9babfd155eb94d943 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:37:46 -0400 Subject: [PATCH 807/877] add import statement --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 53eea3c..32fe3d4 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -13,6 +13,7 @@ from mavecore.validation.exceptions import ValidationError from 
mavecore.validation.variant import validate_hgvs_string from mavecore.validation.utilities import convert_hgvs_nt_to_hgvs_pro, is_null +from mavecore.validation.target import validate_target_sequence # handle with pandas all null strings # provide a csv or a pandas dataframe From a523b04844546ee80aa671e0961bfd6d792bbb18 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:38:30 -0400 Subject: [PATCH 808/877] refactor code to be more modular, call helper function instead of writing code here --- mavecore/validation/dataframe.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 32fe3d4..815712c 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -186,11 +186,8 @@ def validate_values_by_column(dataset, target_seq: str): if dataset.empty: raise ValidationError("Dataset must not be empty.") - # check for ValueError - # if target_seq is not made solely of characters ACTG - check_chars = [letter in "ACTG" for letter in target_seq] - if False in check_chars: - raise ValidationError("target_seq is invalid, must be composed only of bases ACTG.") + # validate target sequence + validate_target_sequence(target_seq) # first check the column names, establish the order or the hgvs and score columns hgvs_nt = False From 782afc5392eabb6102b0267f82490a7382c466f0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:38:52 -0400 Subject: [PATCH 809/877] delete redundant test cases, this is tested elsewhere --- tests/validation/dataframe.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 51010c9..0e72de4 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -149,23 +149,6 @@ def test_sort_column_names(self): 
self.assertTrue(dataset.columns[3] == required_score_column) -class TestValidateVariants(TestCase): - def setUp(self): - self.dataframe = pd.DataFrame( - { - hgvs_nt_column: ["c.1A>G", "c.1A>G", "c.1A>G"], - hgvs_pro_column: ["p.Leu5Glu", "p.Leu5Glu", "p.Leu5Glu"], - hgvs_splice_column: ["c.1A>G", "c.1A>G", "c.1A>G"], - } - ) - - def test_valid_variants(self): - pass #validate_variants(self.dataframe[hgvs_nt_column], hgvs_nt_column) - - def test_invalid_variants(self): - pass - - class TestValidateValuesByColumn(TestCase): def setUp(self): self.target_seq = "ATGACA" From 740c2b436071fed89694d2d8d9a78c7f4709fc34 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:39:52 -0400 Subject: [PATCH 810/877] delete redundant test cases and class, this is tested elsewhere --- tests/validation/dataframe.py | 41 ----------------------------------- 1 file changed, 41 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 0e72de4..708cdf1 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -437,44 +437,3 @@ def test_counts_defines_different_pro_variants(self): self.counts[hgvs_pro_column][0] = "p.Leu75Glu" with self.assertRaises(ValidationError): validate_dataframes_define_same_variants(self.scores, self.counts) - - -class TestTargetSeqIsValidWithRegardsToVariants(TestCase): - - def test_valid_targetseq_validation_fails(self): - '''data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertTrue(dataset.is_valid)''' - - def test_invalid_targetseq_validation_fails(self): - '''data = "{},{},{}\nc.1A>G,p.Val1Phe,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATC") - - self.assertFalse(dataset.is_valid) - 
print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("p.Val1Phe", dataset.errors[0])''' - - def test_invalid_target_sequence_not_a_multiple_of_3(self): - '''data = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format( - self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate(targetseq="ATCG") - - self.assertFalse(dataset.is_valid) - print(dataset.errors) - - self.assertEqual(dataset.n_errors, 1) - self.assertIn("multiple of 3", dataset.errors[0])''' From e27f618c6fef4b0a5a75a6f1a82fe64008a0c8c8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:40:13 -0400 Subject: [PATCH 811/877] reformat --- tests/models/identifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index 7bf787e..8f87d73 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -7,7 +7,6 @@ ExternalIdentifier) - class TestIdentifier(TestCase): def setUp(self): self.identifier = { From 44d34002baa955dd268b5394b428fa8636e5ee64 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:40:27 -0400 Subject: [PATCH 812/877] validate pydantic attribute --- mavecore/models/sequence.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index 293552b..3ab5c04 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -10,3 +10,7 @@ class WildType(BaseModel): @validator('sequenceType') def validate_category(cls, v): target.validate_sequence_category(v) + + @validator('sequence') + def validate_sequence(cls, v): + target.validate_target_sequence(v) From 7eb630319eaa2a542571a4a87d8914864999157a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 17 Oct 2022 16:40:51 -0400 Subject: [PATCH 813/877] update attribute values to 
reflect changes in validation code --- tests/models/sequence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/sequence.py b/tests/models/sequence.py index 91fc9d2..6314cb1 100644 --- a/tests/models/sequence.py +++ b/tests/models/sequence.py @@ -7,14 +7,14 @@ class Test(TestCase): def test_valid_all_fields(self): sequence = { "sequenceType": "Protein", - "sequence": "ATCG", + "sequence": "ATC", } WildType.parse_obj(sequence) def test_invalid_sequence_type(self): sequence = { "sequenceType": "RNA", - "sequence": "ATCG", + "sequence": "ATC", } with self.assertRaises(ValidationError): WildType.parse_obj(sequence) From 3948245982b8af8f22a65b3dbe3cee1f2f2e3480 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 11:52:28 -0700 Subject: [PATCH 814/877] add target test cases for valid inputs --- tests/validation/target.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/validation/target.py b/tests/validation/target.py index 6b04d62..8146ca2 100644 --- a/tests/validation/target.py +++ b/tests/validation/target.py @@ -5,14 +5,19 @@ class TestValidateTargetCategory(TestCase): def test_valid(self): - pass + for category in valid_categories: + validate_target_category(category) class TestValidateSequenceCategory(TestCase): def test_valid(self): - pass + for sequence_type in valid_sequence_types: + validate_sequence_category(sequence_type) class TestValidateTargetSequence(TestCase): + def setUp(self): + self.target_seq = "ATGACCAAACAT" + def test_valid(self): - pass + validate_target_sequence(self.target_seq) From bb0fc559ac02ede8d081e18f0193c663c6ea702e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 11:52:40 -0700 Subject: [PATCH 815/877] edit imports --- tests/validation/target.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/validation/target.py b/tests/validation/target.py index 
8146ca2..213ba81 100644 --- a/tests/validation/target.py +++ b/tests/validation/target.py @@ -1,6 +1,8 @@ from unittest import TestCase from mavecore.validation.target import * +from mavecore.validation.exceptions import ValidationError +from mavecore.validation.constants.target import valid_categories, valid_sequence_types class TestValidateTargetCategory(TestCase): From 2f53b64036dac325e849b0a5c09c82ceb20eeff6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 11:54:51 -0700 Subject: [PATCH 816/877] define target test cases for invalid inputs --- tests/validation/target.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/validation/target.py b/tests/validation/target.py index 213ba81..ed518a3 100644 --- a/tests/validation/target.py +++ b/tests/validation/target.py @@ -10,12 +10,24 @@ def test_valid(self): for category in valid_categories: validate_target_category(category) + def test_invalid_category(self): + pass + + def test_invalid_case(self): + pass + class TestValidateSequenceCategory(TestCase): def test_valid(self): for sequence_type in valid_sequence_types: validate_sequence_category(sequence_type) + def test_invalid_category(self): + pass + + def test_invalid_case(self): + pass + class TestValidateTargetSequence(TestCase): def setUp(self): @@ -23,3 +35,12 @@ def setUp(self): def test_valid(self): validate_target_sequence(self.target_seq) + + def test_invalid_characters(self): + pass + + def test_invalid_case(self): + pass + + def test_invalid_length(self): + pass From fbb205f8ffa3b0e743e663569498023633ebab2b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 12:00:10 -0700 Subject: [PATCH 817/877] implement target test cases for invalid inputs --- tests/validation/target.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/validation/target.py b/tests/validation/target.py 
index ed518a3..dc1500a 100644 --- a/tests/validation/target.py +++ b/tests/validation/target.py @@ -11,10 +11,12 @@ def test_valid(self): validate_target_category(category) def test_invalid_category(self): - pass + with self.assertRaises(ValidationError): + validate_target_category("Protein") def test_invalid_case(self): - pass + with self.assertRaises(ValidationError): + validate_target_category("protein coding") class TestValidateSequenceCategory(TestCase): @@ -23,10 +25,12 @@ def test_valid(self): validate_sequence_category(sequence_type) def test_invalid_category(self): - pass + with self.assertRaises(ValidationError): + validate_sequence_category("RNA") def test_invalid_case(self): - pass + with self.assertRaises(ValidationError): + validate_sequence_category("dna") class TestValidateTargetSequence(TestCase): @@ -37,10 +41,15 @@ def test_valid(self): validate_target_sequence(self.target_seq) def test_invalid_characters(self): - pass + self.target_seq = "AUGACCAAACAU" + with self.assertRaises(ValidationError): + validate_target_sequence(self.target_seq) def test_invalid_case(self): - pass + with self.assertRaises(ValidationError): + validate_target_sequence(self.target_seq.lower()) def test_invalid_length(self): - pass + self.target_seq = self.target_seq + "A" + with self.assertRaises(ValidationError): + validate_target_sequence(self.target_seq) From acfe06b354c56dad6d2b2a6701beb078584071ec Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:09:08 -0700 Subject: [PATCH 818/877] update imports --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 0b01aa0..6846a5a 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -3,8 +3,8 @@ from typing import Optional from mavehgvs.variant import Variant -from mavecore.validation.variant import 
validate_hgvs_string from mavecore.validation.constants.conversion import codon_dict_DNA +from mavecore.validation.constants.conversion import aa_dict_key_1 def is_null(value): From 8663086445a0bb5a6348234bcef0acc02b1fffc1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:09:31 -0700 Subject: [PATCH 819/877] edit is_null implementation to return boolean values --- mavecore/validation/utilities.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 6846a5a..3ba4329 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -22,7 +22,13 @@ def is_null(value): True value is NoneType or if value matches the stated regex patterns in constants.null_values_re. """ value = str(value).strip().lower() - return null_values_re.fullmatch(value) or not value + if not value: + return True + match = null_values_re.fullmatch(value) + if match: + return True + else: + return False def generate_hgvs(prefix: str = "c") -> str: From c14977d68f1b1fca1fe8694f8293d1e4738a06c3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:09:59 -0700 Subject: [PATCH 820/877] update documentation for construct_hgvs_pro function --- mavecore/validation/utilities.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 3ba4329..9a377f1 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -60,15 +60,15 @@ def generate_hgvs(prefix: str = "c") -> str: def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional[str] = None): """ - Given the wt and mutant codons as well as the position, this function generates a validated + Given the wt and mutant 3 lette amino acid codes as well as the position, this function generates a 
validated hgvs_pro string. Parameters __________ wt: str - The wt codon. + The wt 3 letter amino acid code. mutant: str - The mutant codon. + The mutant 3 letter amino acid code. position: int The position of the change. From df7bec39160b1a6f566731eac306fcbfe4b50aa0 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:10:39 -0700 Subject: [PATCH 821/877] update implementation for construct_hgvs_pro function, raise errors when invalid aa codes are provided --- mavecore/validation/utilities.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 9a377f1..1fe1ef1 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -78,6 +78,14 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional The constructed hgvs_pro string. """ # TODO account for when variant codon is None, a deletion event + # check that the provided 3 letter amino acid codes are valid + if wt not in aa_dict_key_1.values(): + raise ValueError("wt 3 letter amino acid code {} is invalid, " + "must be one of the following: {}".format(wt, list(aa_dict_key_1.values()))) + if mutant not in aa_dict_key_1.values(): + raise ValueError("wt 3 letter amino acid code {} is invalid, " + "must be one of the following: {}".format(mutant, list(aa_dict_key_1.values()))) + if wt == mutant: hgvs = "p." 
+ wt + str(position) + "=" else: From 6d41bc482009b9a4e0708ab144bef376d50d1a74 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:10:54 -0700 Subject: [PATCH 822/877] edit imports --- tests/validation/utilities.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index fc13448..2b3cb93 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -1,5 +1,8 @@ from unittest import TestCase +from mavecore.validation.constants.general import null_values_list +from mavecore.validation.variant import validate_pro_variant, validate_nt_variant + from mavecore.validation.utilities import ( is_null, generate_hgvs, From d5512e0662c7fcc46b7f5eeabe0fe814e205e718 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:10:59 -0700 Subject: [PATCH 823/877] edit imports --- tests/validation/utilities.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index 2b3cb93..e183d2e 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -8,10 +8,10 @@ generate_hgvs, construct_hgvs_pro, convert_hgvs_nt_to_hgvs_pro, - is_wild_type, - is_deletion, - is_substitution_one_base, - is_substitution_two_bases_nonadjacent + _is_wild_type, + _is_deletion, + _is_substitution_one_base, + _is_substitution_two_bases_nonadjacent ) From 48463c9778ec5f4a71b8cd770be6810b98ae5a08 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:11:15 -0700 Subject: [PATCH 824/877] implement is_null test methods --- tests/validation/utilities.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index e183d2e..8dd8c5c 100644 --- a/tests/validation/utilities.py +++ 
b/tests/validation/utilities.py @@ -16,11 +16,13 @@ class TestIsNull(TestCase): - def valid_null_values(self): - pass + def test_valid_null_values(self): + for value in null_values_list: + self.assertTrue(is_null(value)) - def invalid_null_values(self): - pass + def test_invalid_null_values(self): + self.assertFalse(is_null(1)) + self.assertFalse(is_null("1")) class TestGenerateHgvsPro(TestCase): From f2083ea2dd490fa47c7299baa41c1b27e399375d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:11:56 -0700 Subject: [PATCH 825/877] implement test methods for utility test cases --- tests/validation/utilities.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index 8dd8c5c..0466a88 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -27,21 +27,25 @@ def test_invalid_null_values(self): class TestGenerateHgvsPro(TestCase): def test_pro(self): - pass + pro = generate_hgvs("p") + validate_pro_variant(pro) def test_nt(self): - pass + nt = generate_hgvs() + validate_nt_variant(nt) class TestConstructHgvsPro(TestCase): - def valid_arguments(self): - pass + def test_valid_arguments(self): + construct_hgvs_pro(wt="Ala", mutant="Gly", position=3) - def invalid_wt_aa(self): - pass + def test_invalid_wt_aa(self): + with self.assertRaises(ValueError): + construct_hgvs_pro(wt="Alr", mutant="Gly", position=3) - def invalid_mut_aa(self): - pass + def test_invalid_mut_aa(self): + with self.assertRaises(ValueError): + construct_hgvs_pro(wt="Ala", mutant="Gla", position=3) def invalid_position(self): pass From ef303cd8f5089b33c0c22cb440b2c84001e6ca92 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:12:16 -0700 Subject: [PATCH 826/877] add TODO for test method in utility test cases --- tests/validation/utilities.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index 0466a88..d45bf6b 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -47,8 +47,9 @@ def test_invalid_mut_aa(self): with self.assertRaises(ValueError): construct_hgvs_pro(wt="Ala", mutant="Gla", position=3) - def invalid_position(self): - pass + def test_invalid_position(self): + # TODO what are the invalid positions we should consider? + self.assertFalse(False) class TestConvertHgvsNtToHgvsPro(TestCase): From ee28eb0e6dedad0df21456e5b30e4dbe941032ef Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 13:12:50 -0700 Subject: [PATCH 827/877] correct invalid function signatures for utility test cases --- tests/validation/utilities.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index d45bf6b..34042c0 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -53,35 +53,35 @@ def test_invalid_position(self): class TestConvertHgvsNtToHgvsPro(TestCase): - def invalid_hgvs_nt(self): + def test_invalid_hgvs_nt(self): pass - def wt_hgvs_nt(self): + def test_wt_hgvs_nt(self): pass - def deletion_hgvs_nt(self): + def test_deletion_hgvs_nt(self): pass - def one_base_change_codon_variant(self): + def test_one_base_change_codon_variant(self): pass - def two_base_change_codon_variant(self): + def test_two_base_change_codon_variant(self): pass - def three_base_change_codon_variant(self): + def test_three_base_change_codon_variant(self): pass class TestVariantTypeHelperFunctions(TestCase): - def test_is_wild_type(self): + def test_test_is_wild_type(self): pass - def is_deletion(self): + def test_is_deletion(self): pass - def test_is_substitution_one_base(self): + def test_test_is_substitution_one_base(self): pass - def 
test_is_substitution_two_bases_nonadjacent(self): + def test_test_is_substitution_two_bases_nonadjacent(self): pass From 1beb259052c473f4694818a7291a9c5630da2895 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 24 Oct 2022 14:59:38 -0700 Subject: [PATCH 828/877] mark TODO, do not apply this function until it has been fully tested --- mavecore/validation/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 815712c..37213f1 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -245,11 +245,12 @@ def validate_values_by_column(dataset, target_seq: str): s = validate_score(dataset.loc[i, required_score_column]) dataset.loc[i, required_score_column] = s if hgvs_nt and hgvs_pro: + # TODO: ensure this function is implemented correctly before applying, complete unit testing if not Variant(dataset.loc[i, hgvs_pro_column]).is_multi_variant(): # can only convert to single hgvs_pro variants - validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, + '''validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, nt=dataset.loc[i, hgvs_nt_column], pro=dataset.loc[i, hgvs_pro_column], - row=i) + row=i)''' # check that primary column, whether hgvs_nt or hgvs_pro, does not contain None values # make sure target seq is the right type From 22ba97d5902e045f3f292f2d16321d3adcdc2c1d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:55:46 -0700 Subject: [PATCH 829/877] remove todo --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 37213f1..d27d947 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -48,6 +48,7 @@ def validate_dataframes(target_seq: str, scores, counts=None): 
def validate_no_null_columns_or_rows(dataframe): + # TODO: we may not need this - current datasets exist where all values are None """ Checks that there are no null columns or rows in the dataframe. Note that a null column may still have a valid column name. From ba4569d48deaa9a63a8dc55fa6bbc8c11805da02 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:56:12 -0700 Subject: [PATCH 830/877] reimplement to cover cases where None values are allowed in columns --- mavecore/validation/dataframe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index d27d947..8cc6292 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -63,6 +63,13 @@ def validate_no_null_columns_or_rows(dataframe): ValidationError If there are null columns or rows in the dataframe """ + # first drop any columns where null columns are allowed + if hgvs_nt_column: + dataframe = dataframe.drop([hgvs_nt_column], axis=1) + if hgvs_pro_column: + dataframe = dataframe.drop([hgvs_pro_column], axis=1) + if hgvs_splice_column: + dataframe = dataframe.drop([hgvs_splice_column], axis=1) df = dataframe.dropna(axis=0, how='all') df = df.dropna(axis=1, how='all') try: From afb1658dd634bc334c8a478d910f2ac094d5065d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:56:34 -0700 Subject: [PATCH 831/877] remove TODO --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 8cc6292..c63fc7f 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -79,6 +79,7 @@ def validate_no_null_columns_or_rows(dataframe): def validate_column_names(dataframe, scores=True): + # TODO: return errors to user regarding column name ordering """ This function validates the columns in a 
dataframe. The first columns should be an hgvs column such as hgvs_nt, hgvs_pro, and hgvs_splice. There should be at least From bccec5e5fd592f408e2737e772b24fb651206746 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:57:04 -0700 Subject: [PATCH 832/877] remove TODO and check that there are no duplicates in column names --- mavecore/validation/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index c63fc7f..c9c2732 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -100,7 +100,11 @@ def validate_column_names(dataframe, scores=True): """ # get columns from dataframe columns = dataframe.columns - # TODO do one of either hgvs_pro and hgvs_nt have to be present? + + # check that there are no duplicate column names + if len(columns) != len(set(columns)): + raise ValidationError("There cannot be duplicate column names.") + # count instances of hgvs columns count = 0 # note presence of different columns From 965d6208f065d962b3ccf3e24ccef6da3c9a518a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:57:30 -0700 Subject: [PATCH 833/877] reimplement, change list of invalid column names --- mavecore/validation/dataframe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index c9c2732..b4b4179 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -114,7 +114,10 @@ def validate_column_names(dataframe, scores=True): score_column = False for i in range(len(columns)): # there should not be any null columns - if is_null(columns[i]) or columns[i] is None: + # check for empty strings, np.nan, and None + # if is_null(columns[i]) or columns[i] is None: + if isinstance(columns[i], str) or columns[i] == "" or 
columns[i].isspace(): + # above condition will check that value is not None or np.nan also raise ValidationError("Column names must not be null.") # in readable_null_values_list: if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: count += 1 From 81513045627eca16e3eecdd36e85d29fdceda35e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:57:38 -0700 Subject: [PATCH 834/877] delete comments --- mavecore/validation/dataframe.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index b4b4179..01b782c 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -159,9 +159,6 @@ def validate_column_names(dataframe, scores=True): if hgvs_nt: nt_column = dataframe.pop(hgvs_nt_column) dataframe.insert(0, hgvs_nt_column, nt_column) - #for i in range(count): - # if columns[i] not in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: - # raise ValidationError("First columns must be hgvs columns.") # there should be at least one additional column beyond the hgvs columns if len(columns) == count: From b1f18ebc9aa51dcfab8d78eadcb621e38ab91f60 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:58:19 -0700 Subject: [PATCH 835/877] reimplement, consider scenarios where column values can take None --- mavecore/validation/dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 01b782c..2ca9357 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -232,7 +232,7 @@ def validate_values_by_column(dataset, target_seq: str): # loop through row by row, validate hgvs strings, make sure nt and pro are consistent with one another for i in range(len(dataset)): - if hgvs_nt: + if hgvs_nt and dataset.loc[i, hgvs_nt_column] is not 
None: validate_hgvs_string(value=dataset.loc[i, hgvs_nt_column], column="nt", targetseq=target_seq, @@ -242,12 +242,12 @@ def validate_values_by_column(dataset, target_seq: str): raise ValidationError("All prefixes within the hgvs_nt column must be the same.") else: # assign the prefix value since it has not yet been assigned hgvs_nt_prefix = Variant(dataset.loc[i, hgvs_nt_column]).prefix - if hgvs_pro: + if hgvs_pro and dataset.loc[i, hgvs_pro_column] is not None: validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], column="p", targetseq=target_seq, splice_present=hgvs_splice) - if hgvs_splice: + if hgvs_splice and dataset.loc[i, hgvs_splice_column] is not None: validate_hgvs_string(value=dataset.loc[i, hgvs_splice_column], column="splice", targetseq=target_seq, From 21b2469d07bb79c0c8c950b73dc17a3ad8ca854c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:58:52 -0700 Subject: [PATCH 836/877] consider case where hgvs_pro should not take value when hgvs_nt is noncoding --- mavecore/validation/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 2ca9357..460483f 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -240,7 +240,11 @@ def validate_values_by_column(dataset, target_seq: str): if hgvs_nt_prefix: if Variant(dataset.loc[i, hgvs_nt_column]).prefix != hgvs_nt_prefix: raise ValidationError("All prefixes within the hgvs_nt column must be the same.") - else: # assign the prefix value since it has not yet been assigned + # if prefix is non-coding, there should not be an hgvs_pro value + if hgvs_nt_prefix == "n": + if hgvs_pro_column and dataset.loc[i, hgvs_pro_column] is not None: + raise ValidationError("Cannot have hgvs_pro value with non-coding hgvs_nt value.") + else: # assign the prefix value since it has not yet been assigned hgvs_nt_prefix = 
Variant(dataset.loc[i, hgvs_nt_column]).prefix if hgvs_pro and dataset.loc[i, hgvs_pro_column] is not None: validate_hgvs_string(value=dataset.loc[i, hgvs_pro_column], From 723a66629d1ad27d386667c82a847e6236b1de6a Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:59:18 -0700 Subject: [PATCH 837/877] consider scenario where column values can take None --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 460483f..c8834e5 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -261,7 +261,7 @@ def validate_values_by_column(dataset, target_seq: str): if score: s = validate_score(dataset.loc[i, required_score_column]) dataset.loc[i, required_score_column] = s - if hgvs_nt and hgvs_pro: + if hgvs_nt and hgvs_pro and dataset.loc[i, hgvs_nt_column] is not None and dataset.loc[i, hgvs_pro_column] is not None: # TODO: ensure this function is implemented correctly before applying, complete unit testing if not Variant(dataset.loc[i, hgvs_pro_column]).is_multi_variant(): # can only convert to single hgvs_pro variants '''validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq=target_seq, From 95691bf906f9aba3fe5329ad00daf44a9b934e38 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:59:42 -0700 Subject: [PATCH 838/877] reimplement validate score, only consider float and int as valid --- mavecore/validation/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index c8834e5..f37b190 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -305,10 +305,9 @@ def validate_index_column(column, hgvs: str): def validate_score(score): - # TODO we probably dont need this - try: + if 
isinstance(score, float) or isinstance(score, int): score = float(score) - except ValueError: + else: raise ValidationError("Each value in score column must by a float. " "'{}' has the type '{}'.".format(score, type(score).__name__)) return score From e4cbc70366368d408ca47d34d25fc45e8ae19cfe Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 14:59:54 -0700 Subject: [PATCH 839/877] add TODO --- mavecore/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index f37b190..8e4e996 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -334,6 +334,7 @@ def validate_hgvs_nt_and_hgvs_pro_represent_same_change(target_seq: str, nt: str ValidationError If the variants do not represent the same change. """ + # TODO think about how double quoted variants are handled here (e.g., "c.[123A>G;124A>G]") nt_converted = convert_hgvs_nt_to_hgvs_pro(nt, target_seq) # compare nt_converted with pro if nt_converted != pro: From 59adeab1335fd0d93c9b9ca18c865e18e2876dac Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:00:02 -0700 Subject: [PATCH 840/877] add import --- tests/validation/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 708cdf1..62f13de 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -1,6 +1,7 @@ from unittest import TestCase import numpy as np import pandas as pd +from io import StringIO from mavecore.validation.exceptions import ValidationError From 212fd374d0ec8a025aed0092f8e10db809431110 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:00:13 -0700 Subject: [PATCH 841/877] reformat --- tests/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 62f13de..341dcf7 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -89,7 +89,7 @@ def test_missing_hgvs_column(self): def test_hgvs_in_wrong_location(self): self.dataframe = self.dataframe[[hgvs_nt_column, required_score_column, hgvs_pro_column, hgvs_splice_column]] - validate_column_names(self.dataframe) # validation fixes problem, should pass + validate_column_names(self.dataframe) # validation fixes problem, should pass def test_no_additional_columns_beyond_hgvs_scores_df(self): self.dataframe = self.dataframe.drop([hgvs_pro_column, hgvs_splice_column, required_score_column], axis=1) From 64950ac93310b9279e2facfe5cbdb4cb489deea2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:00:27 -0700 Subject: [PATCH 842/877] add case for testing duplicate column names --- tests/validation/dataframe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 341dcf7..2e2dfd1 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -106,6 +106,11 @@ def test_hgvs_columns_must_be_lowercase(self): with self.assertRaises(ValueError): validate_column_names(self.dataframe) + def test_duplicate_column_names(self): + self.dataframe.rename(columns={hgvs_pro_column: hgvs_nt_column}, inplace=True) + with self.assertRaises(ValidationError): + validate_column_names(self.dataframe) + def test_null_column_name(self): for value in null_values_list: self.dataframe.rename(columns={hgvs_splice_column: value}, inplace=True) From c8b83cb62a56f2c55b92e96e8ca34ee1cd50d050 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:00:45 -0700 Subject: [PATCH 843/877] add new list of null values that a column name can take --- tests/validation/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 2e2dfd1..05b5f0c 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -112,7 +112,8 @@ def test_duplicate_column_names(self): validate_column_names(self.dataframe) def test_null_column_name(self): - for value in null_values_list: + null_values = [None, np.nan, "", 1, " "] + for value in null_values: self.dataframe.rename(columns={hgvs_splice_column: value}, inplace=True) with self.assertRaises(ValidationError): validate_column_names(self.dataframe) From 1d7fdd0c9dce72934198e7d48a985e230b3d7f13 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:00:59 -0700 Subject: [PATCH 844/877] delete redundant code --- tests/validation/dataframe.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 05b5f0c..44dca69 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -219,25 +219,6 @@ def test_does_not_allow_sy(self): with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) - # TODO this should be handled by pandas - def test_data_method_converts_null_values_to_None(self): - '''hgvs = generate_hgvs() - for value in constants.null_values_list: - with self.subTest(msg=value): - data = "{},{}\n{},{}".format( - self.HGVS_NT_COL, self.SCORE_COL, hgvs, value - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - - df = dataset.data(serializable=True) - self.assertIsNotNone(df[self.HGVS_NT_COL].values[0]) - self.assertIsNone(df[self.SCORE_COL].values[0])''' - - # TODO not sure if we want to do this def test_parses_numeric_column_values_into_float(self): self.dataframe.loc[0, [required_score_column]] = "1.1" self.assertTrue(type(self.dataframe[required_score_column][0]) == str) From 
975d47470f9259d8869533aef53b6f526e07415b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:01:34 -0700 Subject: [PATCH 845/877] edit test case to check for error when string is passes as score value, even though it can be cast to float --- tests/validation/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 44dca69..d4e0f04 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -222,8 +222,9 @@ def test_does_not_allow_sy(self): def test_parses_numeric_column_values_into_float(self): self.dataframe.loc[0, [required_score_column]] = "1.1" self.assertTrue(type(self.dataframe[required_score_column][0]) == str) - validate_values_by_column(self.dataframe, target_seq=self.target_seq) - self.assertTrue(type(self.dataframe[required_score_column][0]) == float) + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.assertFalse(type(self.dataframe[required_score_column][0]) == float) self.dataframe.loc[0, [required_score_column]] = 1 self.assertTrue(type(self.dataframe[required_score_column][0]) == int) validate_values_by_column(self.dataframe, target_seq=self.target_seq) From 66a19163a50a119f537f071e6d30c95dc2389cd2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:02:12 -0700 Subject: [PATCH 846/877] remove unneeded test case --- tests/validation/dataframe.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index d4e0f04..1e88a7b 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -230,6 +230,7 @@ def test_parses_numeric_column_values_into_float(self): validate_values_by_column(self.dataframe, target_seq=self.target_seq) 
self.assertTrue(type(self.dataframe[required_score_column][0]) == float) + # TODO: validate hgvs string should check this def test_does_not_split_double_quoted_variants(self): '''hgvs = "c.[123A>G;124A>G]" data = '{},{}\n"{}",1.0'.format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) @@ -248,22 +249,6 @@ def test_does_not_split_double_quoted_variants(self): # with self.assertRaises(ValidationError): # _ = validate_variant_rows(BytesIO(data.encode()))''' - # TODO may not need to include this - def test_df_indexed_by_primary_column(self): - '''data = "{},{},{}\n{},{},1.0".format( - self.HGVS_NT_COL, - self.HGVS_PRO_COL, - self.SCORE_COL, - generate_hgvs(prefix="c"), - generate_hgvs(prefix="p"), - ) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - assert_index_equal(dataset.data().index, dataset.index)''' - def test_invalid_genomic_and_transcript_mixed_in_nt_column(self): self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) From e9de63e4f00bb36ffb723af683eb497184abeeea Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:02:58 -0700 Subject: [PATCH 847/877] add test cases to check value in hgvs_pro column depending on prefix of hgvs_nt column --- tests/validation/dataframe.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 1e88a7b..57a0190 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -262,6 +262,27 @@ def test_invalid_nt_not_genomic_when_splice_present(self): with self.assertRaises(ValidationError): validate_values_by_column(self.dataframe, target_seq=self.target_seq) + def test_noncoding_hgvs_nt_should_not_have_hgvs_pro_columns(self): + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + self.dataframe.loc[0, [hgvs_nt_column]] = "n.4A>G" + 
self.dataframe.loc[1, [hgvs_nt_column]] = "n.5C>G" + self.dataframe.loc[2, [hgvs_nt_column]] = "n.6A>G" + with self.assertRaises(ValidationError): + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.dataframe.loc[0, [hgvs_pro_column]] = None + self.dataframe.loc[1, [hgvs_pro_column]] = None + self.dataframe.loc[2, [hgvs_pro_column]] = None + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + + def test_coding_hgvs_nt_may_have_hgvs_pro_columns(self): + self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) + self.dataframe.loc[0, [hgvs_nt_column]] = "c.4A>G" + self.dataframe.loc[1, [hgvs_nt_column]] = "c.5C>G" + self.dataframe.loc[2, [hgvs_nt_column]] = "c.6A>G" + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + self.dataframe = self.dataframe.drop([hgvs_pro_column], axis=1) + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + def test_invalid_splice_not_defined_when_nt_is_genomic(self): self.dataframe = self.dataframe.drop([hgvs_splice_column], axis=1) with self.assertRaises(ValidationError): From 547e607c046ea0600ddd7e7100f122ff0e9f707f Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:03:18 -0700 Subject: [PATCH 848/877] implement test to check that 0 is not parsed as None --- tests/validation/dataframe.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index 57a0190..ec417ae 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -289,15 +289,13 @@ def test_invalid_splice_not_defined_when_nt_is_genomic(self): validate_values_by_column(self.dataframe, target_seq=self.target_seq) def test_invalid_zero_is_not_parsed_as_none(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},0.0".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = 
MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 0)''' + self.dataframe.loc[0, [required_score_column]] = 0.0 + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + hgvs = "c.4A>G" + data = "{},{}\n{},0.0".format(hgvs_nt_column, required_score_column, hgvs) + df = pd.read_csv(StringIO(data), sep=",") + validate_values_by_column(df, target_seq=self.target_seq) + self.assertEqual(df[required_score_column].values[0], 0) def test_invalid_close_to_zero_is_not_parsed_as_none(self): '''hgvs = generate_hgvs(prefix="c") From 3663669079d0be6f30ad58af505198620fc2271d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:03:32 -0700 Subject: [PATCH 849/877] implement test to check that close to 0 is not parsed as None --- tests/validation/dataframe.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index ec417ae..f48941d 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -298,15 +298,13 @@ def test_invalid_zero_is_not_parsed_as_none(self): self.assertEqual(df[required_score_column].values[0], 0) def test_invalid_close_to_zero_is_not_parsed_as_none(self): - '''hgvs = generate_hgvs(prefix="c") - data = "{},{}\n{},5.6e-15".format(self.HGVS_NT_COL, self.SCORE_COL, hgvs) - - dataset = MaveDataset.for_scores(StringIO(data)) - dataset.validate() - - self.assertTrue(dataset.is_valid) - df = dataset.data() - self.assertEqual(df[self.SCORE_COL].values[0], 5.6e-15)''' + self.dataframe.loc[0, [required_score_column]] = 5.6e-15 + validate_values_by_column(self.dataframe, target_seq=self.target_seq) + hgvs = "c.4A>G" + data = "{},{}\n{},5.6e-15".format(hgvs_nt_column, required_score_column, hgvs) + df = pd.read_csv(StringIO(data), sep=",") + validate_values_by_column(df, 
target_seq=self.target_seq) + self.assertEqual(df[required_score_column].values[0], 5.6e-15) def test_mismatched_variants_and_column_names(self): self.dataframe = pd.DataFrame( From 5f5221521c020e525ba9171c91f7619b67ffaa22 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:03:56 -0700 Subject: [PATCH 850/877] edit docstring --- mavecore/validation/utilities.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 1fe1ef1..17156fa 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -76,6 +76,11 @@ def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional _______ hgvs The constructed hgvs_pro string. + + Raises + ______ + ValueError + If the wt or mutant 3 letter amino acid codes are invalid. """ # TODO account for when variant codon is None, a deletion event # check that the provided 3 letter amino acid codes are valid From 1be417327a0ca82fe3d0f251af1cd01cb86c47aa Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:04:28 -0700 Subject: [PATCH 851/877] reformat --- mavecore/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 17156fa..3986f68 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -150,7 +150,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str): # now that we have the variant_position, get codon_number codon_number = round((variant_position / 3) + 0.5) # use codon_number to get target_codon from target_seq - target_codon = target_seq[(codon_number - 1) * 3 : codon_number * 3] + target_codon = target_seq[(codon_number - 1) * 3: codon_number * 3] # declare variables for codon data # keep track of the number and location of the changes within the codon 
From 69627aaba7ce905a9d90ba3b8af41f142b464853 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:05:59 -0700 Subject: [PATCH 852/877] add notes to improve utility functions --- mavecore/validation/utilities.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 3986f68..b5e0b84 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -5,6 +5,7 @@ from mavehgvs.variant import Variant from mavecore.validation.constants.conversion import codon_dict_DNA from mavecore.validation.constants.conversion import aa_dict_key_1 +#from mavecore.validation.variant import validate_hgvs_string def is_null(value): @@ -59,6 +60,7 @@ def generate_hgvs(prefix: str = "c") -> str: def construct_hgvs_pro(wt: str, mutant: str, position: int, target_seq: Optional[str] = None): + # TODO: the testing on this function needs to be improved """ Given the wt and mutant 3 lette amino acid codes as well as the position, this function generates a validated hgvs_pro string. @@ -121,6 +123,11 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str): ValueError If target_seq is not made solely of characters ACTG. """ + # check that the hgvs_nt variant is valid with regards to the target sequence + #validate_hgvs_string(value=hgvs_nt, + # column="nt", + # targetseq=target_seq) + # check for TypeError # if target_seq is not string if not isinstance(target_seq, str): @@ -263,6 +270,7 @@ def convert_hgvs_nt_to_hgvs_pro(hgvs_nt: str, target_seq: str): def _is_wild_type(hgvs: str): + # TODO this is no longer valid """ This function takes an hgvs formatted string and returns True if the hgvs string indicates there was no change from the target sequence. 
From 17c44ec057483a14f2476f1be9e7099c9486476e Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:06:11 -0700 Subject: [PATCH 853/877] edit imports --- tests/validation/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index 34042c0..7a4d7e5 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -1,7 +1,7 @@ from unittest import TestCase from mavecore.validation.constants.general import null_values_list -from mavecore.validation.variant import validate_pro_variant, validate_nt_variant +from mavecore.validation.variant import validate_hgvs_string #validate_pro_variant, validate_nt_variant from mavecore.validation.utilities import ( is_null, From 9df14734387feacaaac9f8597cdfe841f4770e28 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:06:38 -0700 Subject: [PATCH 854/877] update test cases --- tests/validation/utilities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/validation/utilities.py b/tests/validation/utilities.py index 7a4d7e5..040de46 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -28,11 +28,11 @@ def test_invalid_null_values(self): class TestGenerateHgvsPro(TestCase): def test_pro(self): pro = generate_hgvs("p") - validate_pro_variant(pro) + validate_hgvs_string(pro) def test_nt(self): nt = generate_hgvs() - validate_nt_variant(nt) + validate_hgvs_string(nt) class TestConstructHgvsPro(TestCase): From 8a97908cc9dac382ef7d805347993a81a7926ba8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:07:02 -0700 Subject: [PATCH 855/877] add to outline of unittests --- tests/validation/utilities.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/validation/utilities.py 
b/tests/validation/utilities.py index 040de46..c5e0618 100644 --- a/tests/validation/utilities.py +++ b/tests/validation/utilities.py @@ -53,10 +53,16 @@ def test_invalid_position(self): class TestConvertHgvsNtToHgvsPro(TestCase): - def test_invalid_hgvs_nt(self): - pass + def setUp(self): + self.target_seq = "ATGACA" + self.hgvs_nt_values = ["g.4A>G", "g.5C>G", "g.6A>G"] + self.hgvs_pro_values = ["p.Thr2Ala", "p.Thr2Arg", "p.Thr2="] def test_wt_hgvs_nt(self): + #convert_hgvs_nt_to_hgvs_pro(hgvs_nt="g.4A>G", ) + pass + + def test_wt_hgvs_pro(self): pass def test_deletion_hgvs_nt(self): From 52652983f732b275db49e282782e14384f2bd0ad Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Fri, 4 Nov 2022 15:07:40 -0700 Subject: [PATCH 856/877] comment out partial function calls --- mavecore/validation/variant.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mavecore/validation/variant.py b/mavecore/validation/variant.py index adf785d..53e37e2 100644 --- a/mavecore/validation/variant.py +++ b/mavecore/validation/variant.py @@ -118,6 +118,6 @@ def validate_hgvs_string( return str(variant) -validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) -validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) -validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) +#validate_nt_variant = partial(validate_hgvs_string, **{"column": "nt"}) +#validate_splice_variant = partial(validate_hgvs_string, **{"column": "splice"}) +#validate_pro_variant = partial(validate_hgvs_string, **{"column": "p"}) From eb50e99c986e847eca7cd08703f8d0c2d52e2e20 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 12:17:15 -0800 Subject: [PATCH 857/877] add dataset validation --- mavecore/validation/dataset.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mavecore/validation/dataset.py diff --git 
a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py new file mode 100644 index 0000000..e69de29 From c362945b6adf6c50bd5093a9b729312f2ff94f15 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 12:31:07 -0800 Subject: [PATCH 858/877] create dictionary validation functions --- mavecore/validation/dataset.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index e69de29..b820d06 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -0,0 +1,15 @@ +from mavecore.models.data import Experiment, ScoreSet + + +def validate_experiment(experiment: dict): + try: + Experiment.parse_obj(experiment) + except ValueError as e: + print(e) + + +def validate_scoreset(scoreset: dict): + try: + ScoreSet.parse_obj(scoreset) + except ValueError as e: + print(e) From bbd43ebb0eee8d441f287f2b3412e9777e82b616 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:02:49 -0800 Subject: [PATCH 859/877] edit import --- mavecore/validation/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index b820d06..a1099d8 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -1,3 +1,5 @@ +import json + from mavecore.models.data import Experiment, ScoreSet From 6bd39f433788af25a2846006666fa87b39624b31 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:03:11 -0800 Subject: [PATCH 860/877] write documentation for scoreset and experiment dictionary validation --- mavecore/validation/dataset.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index a1099d8..e621106 100644 --- a/mavecore/validation/dataset.py +++ 
b/mavecore/validation/dataset.py @@ -4,6 +4,21 @@ def validate_experiment(experiment: dict): + """ + Validates an experiment represented as a dictionary. Validation is handled via pydantic. A valid dictionary is + returned upon validation. If extra or duplicate keys are included, those fields are excluded from the returned + dictionary. If required keys are missing or any keys contain incorrect values, an error is raised. + + Parameters + __________ + experiment : dict + The experiment dictionary that will be validated. + + Raises + ______ + ValueError + If required keys are missing or any keys contain incorrect values. + """ try: Experiment.parse_obj(experiment) except ValueError as e: @@ -11,6 +26,22 @@ def validate_experiment(experiment: dict): def validate_scoreset(scoreset: dict): + """ + Validates a scoreset represented as a dictionary (Note: this does not validate dataframes, look to dataframe.py + for that validation code). Validation is handled via pydantic. A valid dictionary is returned upon validation. + If extra or duplicate keys are included, those fields are excluded from the returned dictionary. If required keys + are missing or any keys contain incorrect values, an error is raised. + + Parameters + __________ + experiment : dict + The scoreset dictionary that will be validated. + + Raises + ______ + ValueError + If required keys are missing or any keys contain incorrect values. 
+ """ try: ScoreSet.parse_obj(scoreset) except ValueError as e: From 789375148949b80acc90ca922e4bd1d79794429c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:03:57 -0800 Subject: [PATCH 861/877] return a validated dictionary from the dictionary validation functions --- mavecore/validation/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/validation/dataset.py b/mavecore/validation/dataset.py index e621106..716ae59 100644 --- a/mavecore/validation/dataset.py +++ b/mavecore/validation/dataset.py @@ -20,7 +20,7 @@ def validate_experiment(experiment: dict): If required keys are missing or any keys contain incorrect values. """ try: - Experiment.parse_obj(experiment) + return json.loads(Experiment.parse_obj(experiment).json()) except ValueError as e: print(e) @@ -43,6 +43,6 @@ def validate_scoreset(scoreset: dict): If required keys are missing or any keys contain incorrect values. """ try: - ScoreSet.parse_obj(scoreset) + return json.loads(ScoreSet.parse_obj(scoreset).json()) except ValueError as e: print(e) From 95e63f86d41e9ff758b8232ef6caf98ffe2bc5e1 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:17:07 -0800 Subject: [PATCH 862/877] create test file for dataset validation --- tests/validation/dataset.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/validation/dataset.py diff --git a/tests/validation/dataset.py b/tests/validation/dataset.py new file mode 100644 index 0000000..e69de29 From e8a6acd3bf00f7e3229c1d6d036d2c530099dbe2 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Tue, 8 Nov 2022 14:21:30 -0800 Subject: [PATCH 863/877] outline dataset test cases --- tests/validation/dataset.py | 79 +++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/validation/dataset.py b/tests/validation/dataset.py index 
e69de29..ccea630 100644 --- a/tests/validation/dataset.py +++ b/tests/validation/dataset.py @@ -0,0 +1,79 @@ +from unittest import TestCase +from mavecore.validation.dataset import validate_experiment, validate_scoreset + + +class TestValidateExperiment(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} + self.experiment = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "keywords": ["string"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + } + + def test_valid_all_fields(self): + validate_experiment(self.experiment) + '''try: + print(type(json.loads(Experiment.parse_obj(self.experiment).json()))) + #print(a.json()) + + #b = dict() + #print(b.json()) + except ValueError as e: + print(e)''' + + def test_valid_exclude_optional(self): + self.experiment.pop("extraMetadata") + self.experiment.pop("keywords") + self.experiment.pop("doiIdentifiers") + self.experiment.pop("pubmedIdentifiers") + validate_experiment(self.experiment) + + +class TestValidateScoreSet(TestCase): + def setUp(self): + doi_identifier = {"identifier": "10.1038/s41588-018-0122-z"} + pubmed_identifier = {"identifier": "29785012"} + reference_map = {"genomeId": 0, "targetId": 0} + sequence = {"sequenceType": "DNA", "sequence": "ATC"} + external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} + external_identifier = {"identifier": external_identifier_id, "offset": 0} + target = {"name": "name", + "category": "Protein coding", + "externalIdentifiers": [external_identifier], + "referenceMaps": [reference_map], + "wtSequence": sequence} + self.scoreset = { + "title": "title", + "shortDescription": "short description", + "abstractText": "abstract", + "methodText": "methods", + "extraMetadata": {}, + "dataUsagePolicy": "policy", + "licenceId": 0, + "keywords": ["string"], 
+ "experimentUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "supersededScoresetUrn": "tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5", + "metaAnalysisSourceScoresetUrns": ["tmp:0a56b8eb-8e19-4906-8cc7-d17d884330a5"], + "doiIdentifiers": [doi_identifier], + "pubmedIdentifiers": [pubmed_identifier], + "targetGene": target, + } + + def test_valid_all_fields(self): + validate_scoreset(self.scoreset) + + def test_valid_exclude_optional(self): + self.scoreset.pop("extraMetadata") + self.scoreset.pop("keywords") + self.scoreset.pop("doiIdentifiers") + self.scoreset.pop("pubmedIdentifiers") + self.scoreset.pop("supersededScoresetUrn") + self.scoreset.pop("metaAnalysisSourceScoresetUrns") + validate_scoreset(self.scoreset) From f90f07245d32a21b69ddeb5df30aecc9c8960150 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Nov 2022 10:58:12 -0800 Subject: [PATCH 864/877] update dbname --- tests/models/identifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/identifier.py b/tests/models/identifier.py index 8f87d73..006630b 100644 --- a/tests/models/identifier.py +++ b/tests/models/identifier.py @@ -49,7 +49,7 @@ def test_invalid_type_of_identifier(self): class TestExternalIdentifier(TestCase): def setUp(self): - self.external_identifier_id = {"dbname": "uniprot", "identifier": "P01133"} + self.external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} self.external_identifier = {"identifier": self.external_identifier_id, "offset": 0} """def test_valid_external_identifier_id(self): From 80efe631cd149b8d38282565d122ab57bff6defd Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Nov 2022 10:59:29 -0800 Subject: [PATCH 865/877] update targetseq and dbname to pass unittests --- tests/models/target.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/target.py b/tests/models/target.py index e6deb5f..62ace8b 100644 --- 
a/tests/models/target.py +++ b/tests/models/target.py @@ -6,8 +6,8 @@ class TestTargetGene(TestCase): def setUp(self): reference_map = {"genomeId": 0, "targetId": 0} - sequence = {"sequenceType": "Protein", "sequence": "ATCG"} - external_identifier_id = {"dbname": "uniprot", "identifier": "P01133"} + sequence = {"sequenceType": "Protein", "sequence": "ATCGAA"} + external_identifier_id = {"dbname": "UniProt", "identifier": "P01133"} external_identifier = {"identifier": external_identifier_id, "offset": 0} self.target = {"name": "name", "category": "Protein coding", From 2d4ab8424c65f7da55614107d1fa6ec0a4a31ab3 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Nov 2022 11:04:46 -0800 Subject: [PATCH 866/877] update validate null columns or rows tests to reflect changes in validation function --- tests/validation/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/validation/dataframe.py b/tests/validation/dataframe.py index f48941d..7d0117a 100644 --- a/tests/validation/dataframe.py +++ b/tests/validation/dataframe.py @@ -32,6 +32,7 @@ def setUp(self): hgvs_nt_column: ["c.1A>G"], hgvs_pro_column: ["p.Leu5Glu"], hgvs_splice_column: ["c.1A>G"], + required_score_column: 1.0, } ) @@ -39,12 +40,12 @@ def test_valid(self): validate_no_null_columns_or_rows(self.dataframe) def test_null_row(self): - self.dataframe.loc[1] = [np.nan, np.nan, np.nan] + self.dataframe.loc[1] = [np.nan, np.nan, np.nan, np.nan] with self.assertRaises(AssertionError): validate_no_null_columns_or_rows(self.dataframe) def test_null_column(self): - self.dataframe[hgvs_pro_column][0] = np.nan + self.dataframe[required_score_column][0] = np.nan with self.assertRaises(AssertionError): validate_no_null_columns_or_rows(self.dataframe) From 2a3ef7da40d713f25e76a6a70ab4744944e5be77 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Nov 2022 11:06:34 -0800 Subject: [PATCH 867/877] 
update conditional to accurately validate column names --- mavecore/validation/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/validation/dataframe.py b/mavecore/validation/dataframe.py index 8e4e996..c6fe168 100644 --- a/mavecore/validation/dataframe.py +++ b/mavecore/validation/dataframe.py @@ -116,7 +116,7 @@ def validate_column_names(dataframe, scores=True): # there should not be any null columns # check for empty strings, np.nan, and None # if is_null(columns[i]) or columns[i] is None: - if isinstance(columns[i], str) or columns[i] == "" or columns[i].isspace(): + if not isinstance(columns[i], str) or columns[i] == "" or columns[i].isspace(): # above condition will check that value is not None or np.nan also raise ValidationError("Column names must not be null.") # in readable_null_values_list: if columns[i] in [hgvs_nt_column, hgvs_pro_column, hgvs_splice_column]: From 3320c6c4db0e1d7c23303ef26aff537d1ab852e8 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Wed, 9 Nov 2022 11:11:07 -0800 Subject: [PATCH 868/877] update valid experimentset urn test and mark variant urn test as TODO --- tests/validation/urn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/validation/urn.py b/tests/validation/urn.py index ac2b33f..6f86eb1 100644 --- a/tests/validation/urn.py +++ b/tests/validation/urn.py @@ -12,7 +12,7 @@ def test_invalid_mavedb_urn(self): validate_mavedb_urn("urn:mavedb:00000002-a-1-z") def test_valid_mavedb_urn_experimentset(self): - validate_mavedb_urn_experimentset("") + validate_mavedb_urn_experimentset("urn:mavedb:00000001") def test_invalid_mavedb_urn_experimentset(self): with self.assertRaises(ValidationError): @@ -33,7 +33,9 @@ def test_invalid_mavedb_urn_scoreset(self): validate_mavedb_urn_scoreset("") def test_valid_mavedb_urn_variant(self): - validate_mavedb_urn_variant("") + # TODO find a valid variant urn + pass + 
#validate_mavedb_urn_variant("") def test_invalid_mavedb_urn_variant(self): with self.assertRaises(ValidationError): From b9f33ef6b6b95a145eb30235dd7007eb7a860eb4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:00:38 -0800 Subject: [PATCH 869/877] add camelCase conversion utility function --- mavecore/validation/utilities.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mavecore/validation/utilities.py b/mavecore/validation/utilities.py index 252631c..2291e90 100644 --- a/mavecore/validation/utilities.py +++ b/mavecore/validation/utilities.py @@ -8,6 +8,12 @@ #from mavecore.validation.variant import validate_hgvs_string +def to_camel(string: str) -> str: + camel = ''.join(word.capitalize() for word in string.split('_')) + camel = camel[0].lower() + camel[1:] + return camel + + def is_null(value): """ Returns True if a stripped/lowercase value in in `nan_col_values`. From 2dbe10dd693ced46b308142653f37926bd11eaf4 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:01:28 -0800 Subject: [PATCH 870/877] import camelCase utilty function to pydantic model files --- mavecore/models/data.py | 1 + mavecore/models/identifier.py | 1 + mavecore/models/map.py | 2 ++ mavecore/models/sequence.py | 1 + mavecore/models/target.py | 1 + 5 files changed, 6 insertions(+) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 3228598..4698ce9 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -5,6 +5,7 @@ from .target import TargetGene from mavecore.validation import keywords, urn +from mavecore.validation.utilities import to_camel class DataSet(BaseModel): diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 4b91c86..339c981 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -2,6 +2,7 @@ from typing import Optional from mavecore.validation import identifier as id +from 
mavecore.validation.utilities import to_camel class Identifier(BaseModel): diff --git a/mavecore/models/map.py b/mavecore/models/map.py index 4b50a32..ca4a012 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -1,5 +1,7 @@ from pydantic import BaseModel +from mavecore.validation.utilities import to_camel + class ReferenceMap(BaseModel): genomeId: int diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index 3ab5c04..ffae1c1 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -1,6 +1,7 @@ from pydantic import BaseModel, validator from mavecore.validation import target +from mavecore.validation.utilities import to_camel class WildType(BaseModel): diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 77e1a18..5f9fd26 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -6,6 +6,7 @@ from mavecore.validation import target from mavecore.models.identifier import ExternalIdentifier +from mavecore.validation.utilities import to_camel class TargetGene(BaseModel): From 66a99a49aad755a2862a3435ee0e2cb7d762179b Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:01:48 -0800 Subject: [PATCH 871/877] add pydantic to requirements file --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index d535553..ae0e580 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,3 @@ pre-commit coverage +pydantic From cce4ff88586af48e42670401de403ae483e4d97c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:02:34 -0800 Subject: [PATCH 872/877] change model attributes to snake_case --- mavecore/models/data.py | 28 ++++++++++++++-------------- mavecore/models/map.py | 7 +++++-- mavecore/models/sequence.py | 2 +- mavecore/models/target.py | 4 ++-- 4 files changed, 22 insertions(+), 19 deletions(-) diff 
--git a/mavecore/models/data.py b/mavecore/models/data.py index 4698ce9..0fef202 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -10,10 +10,10 @@ class DataSet(BaseModel): title: str - shortDescription: str - abstractText: str - methodText: str - extraMetadata: Optional[Dict] + short_description: str + abstract_text: str + method_text: str + extra_metadata: Optional[Dict] keywords: Optional[List[str]] @validator('keywords') @@ -22,19 +22,19 @@ def validate_keywords(cls, v): class Experiment(DataSet): - doiIdentifiers: Optional[List[DoiIdentifier]] - pubmedIdentifiers: Optional[List[PubmedIdentifier]] + doi_identifiers: Optional[List[DoiIdentifier]] + pubmed_identifiers: Optional[List[PubmedIdentifier]] class ScoreSet(DataSet): - dataUsagePolicy: str - licenceId: int - experimentUrn: str - supersededScoresetUrn: Optional[str] - metaAnalysisSourceScoresetUrns: Optional[List[str]] - doiIdentifiers: Optional[List[DoiIdentifier]] - pubmedIdentifiers: Optional[List[PubmedIdentifier]] - targetGene: TargetGene + data_usage_policy: str + licence_id: int + experiment_urn: str + superseded_scoreset_urn: Optional[str] + meta_analysis_source_scoreset_urns: Optional[List[str]] + doi_identifiers: Optional[List[DoiIdentifier]] + pubmed_identifiers: Optional[List[PubmedIdentifier]] + target_gene: TargetGene @validator('supersededScoresetUrn', 'metaAnalysisSourceScoresetUrns') def validate_scoreset_urn(cls, v): diff --git a/mavecore/models/map.py b/mavecore/models/map.py index ca4a012..bbad61a 100644 --- a/mavecore/models/map.py +++ b/mavecore/models/map.py @@ -4,5 +4,8 @@ class ReferenceMap(BaseModel): - genomeId: int - targetId: int + genome_id: int + target_id: int + + class Config: + alias_generator = to_camel diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index ffae1c1..ad454db 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -5,7 +5,7 @@ class WildType(BaseModel): - sequenceType: str + sequence_type: 
str sequence: str @validator('sequenceType') diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 5f9fd26..193978f 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -12,8 +12,8 @@ class TargetGene(BaseModel): name: str category: str - externalIdentifiers: List[ExternalIdentifier] - referenceMaps: List[ReferenceMap] + external_identifiers: List[ExternalIdentifier] + reference_maps: List[ReferenceMap] wtSequence: WildType @validator('category') From 761e067918abda859cb16cbbb60e2c59d674ed1c Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:03:29 -0800 Subject: [PATCH 873/877] add generator to pydantic models to create camelCase alias' to the snake_case attributes --- mavecore/models/data.py | 3 +++ mavecore/models/identifier.py | 3 +++ mavecore/models/sequence.py | 5 ++++- mavecore/models/target.py | 3 +++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 0fef202..a2b3467 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -16,6 +16,9 @@ class DataSet(BaseModel): extra_metadata: Optional[Dict] keywords: Optional[List[str]] + class Config: + alias_generator = to_camel + @validator('keywords') def validate_keywords(cls, v): keywords.validate_keywords(v) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index 339c981..a4db631 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -53,6 +53,9 @@ class ExternalIdentifier(BaseModel): identifier: dict offset: Optional[int] + class Config: + alias_generator = to_camel + @validator('identifier') def validate_identifier(cls, v): id.validate_external_identifier(v) diff --git a/mavecore/models/sequence.py b/mavecore/models/sequence.py index ad454db..7c2399e 100644 --- a/mavecore/models/sequence.py +++ b/mavecore/models/sequence.py @@ -8,7 +8,10 @@ class WildType(BaseModel): sequence_type: str 
sequence: str - @validator('sequenceType') + class Config: + alias_generator = to_camel + + @validator('sequence_type') def validate_category(cls, v): target.validate_sequence_category(v) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index 193978f..aa579a8 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -16,6 +16,9 @@ class TargetGene(BaseModel): reference_maps: List[ReferenceMap] wtSequence: WildType + class Config: + alias_generator = to_camel + @validator('category') def validate_category(cls, v): target.validate_target_category(v) From b7276bc2233e1c255bb489ab9fd06fc820ede8ab Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:03:34 -0800 Subject: [PATCH 874/877] add generator to pydantic models to create camelCase alias' to the snake_case attributes --- mavecore/models/identifier.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mavecore/models/identifier.py b/mavecore/models/identifier.py index a4db631..13c748e 100644 --- a/mavecore/models/identifier.py +++ b/mavecore/models/identifier.py @@ -8,6 +8,9 @@ class Identifier(BaseModel): identifier: str + class Config: + alias_generator = to_camel + class DoiIdentifier(Identifier): From 29d366e89ac5eb7d69d311c3c48be419a52ab4e6 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:03:50 -0800 Subject: [PATCH 875/877] update decorator arguments to snake_case --- mavecore/models/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index a2b3467..2e922f5 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -39,13 +39,13 @@ class ScoreSet(DataSet): pubmed_identifiers: Optional[List[PubmedIdentifier]] target_gene: TargetGene - @validator('supersededScoresetUrn', 'metaAnalysisSourceScoresetUrns') + @validator('superseded_scoresetUrn', 
'meta_analysis_source_scoreset_urns') def validate_scoreset_urn(cls, v): if type(v) == str: urn.validate_mavedb_urn_scoreset(v) else: [urn.validate_mavedb_urn_scoreset(s) for s in v] - @validator('experimentUrn') + @validator('experiment_urn') def validate_experiment_urn(cls, v): urn.validate_mavedb_urn_experiment(v) From ffa73fcaabd22fbf67ca1c873864ac79e2125a6d Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:04:28 -0800 Subject: [PATCH 876/877] update decorator arguments to snake_case --- mavecore/models/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/data.py b/mavecore/models/data.py index 2e922f5..b0d1800 100644 --- a/mavecore/models/data.py +++ b/mavecore/models/data.py @@ -39,7 +39,7 @@ class ScoreSet(DataSet): pubmed_identifiers: Optional[List[PubmedIdentifier]] target_gene: TargetGene - @validator('superseded_scoresetUrn', 'meta_analysis_source_scoreset_urns') + @validator('superseded_scoreset_urn', 'meta_analysis_source_scoreset_urns') def validate_scoreset_urn(cls, v): if type(v) == str: urn.validate_mavedb_urn_scoreset(v) From 324ce77a84a29b8e2336cf4b08c9bdec168cdc26 Mon Sep 17 00:00:00 2001 From: harmatt <79935163+harmatt@users.noreply.github.com> Date: Mon, 28 Nov 2022 18:06:36 -0800 Subject: [PATCH 877/877] update pydantic attribute to snake_case --- mavecore/models/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mavecore/models/target.py b/mavecore/models/target.py index aa579a8..d246337 100644 --- a/mavecore/models/target.py +++ b/mavecore/models/target.py @@ -14,7 +14,7 @@ class TargetGene(BaseModel): category: str external_identifiers: List[ExternalIdentifier] reference_maps: List[ReferenceMap] - wtSequence: WildType + wt_sequence: WildType class Config: alias_generator = to_camel