Wrap gmtselect (#1429)

weiji14 · willschlitzer · web-flow · commit bc140cb3b723 · 2021-10-29T05:34:13.000+01:00
* Wrap  the gmtselect function which selects data table subsets based on multiple spatial
criteria. Aliased non-common optional parameters reverse (I) and
z_subregion (Z), area_thresh (A), resolution (D), gridmask (G) and mask (N).
*Add tests for select
*Add imports for select

Co-authored-by: Will Schlitzer &lt;schlitzer90@gmail.com&gt;
diff --git a/doc/api/index.rst b/doc/api/index.rst
@@ -84,6 +84,7 @@ Operations on tabular data:
     blockmode
     nearneighbor
     project
+    select
     sph2grd
     sphdistance
     sphinterpolate
diff --git a/pygmt/__init__.py b/pygmt/__init__.py
@@ -51,6 +51,7 @@
     makecpt,
     nearneighbor,
     project,
+    select,
     sph2grd,
     sphdistance,
     sphinterpolate,
diff --git a/pygmt/src/__init__.py b/pygmt/src/__init__.py
@@ -38,6 +38,7 @@
 from pygmt.src.plot3d import plot3d
 from pygmt.src.project import project
 from pygmt.src.rose import rose
+from pygmt.src.select import select
 from pygmt.src.solar import solar
 from pygmt.src.sph2grd import sph2grd
 from pygmt.src.sphdistance import sphdistance
diff --git a/pygmt/src/select.py b/pygmt/src/select.py
@@ -0,0 +1,172 @@
+"""
+select - Select data table subsets based on multiple spatial criteria.
+"""
+import pandas as pd
+from pygmt.clib import Session
+from pygmt.helpers import (
+    GMTTempFile,
+    build_arg_string,
+    fmt_docstring,
+    kwargs_to_strings,
+    use_alias,
+)
+
+
+@fmt_docstring
+@use_alias(
+    A="area_thresh",
+    D="resolution",
+    G="gridmask",
+    I="reverse",
+    J="projection",
+    N="mask",
+    R="region",
+    V="verbose",
+    Z="z_subregion",
+    b="binary",
+    d="nodata",
+    e="find",
+    f="coltypes",
+    g="gap",
+    h="header",
+    i="incols",
+    o="outcols",
+    r="registration",
+    s="skiprows",
+    w="wrap",
+)
+@kwargs_to_strings(M="sequence", R="sequence", i="sequence_comma", o="sequence_comma")
+def select(data=None, outfile=None, **kwargs):
+    r"""
+    Select data table subsets based on multiple spatial criteria.
+
+    This is a filter that reads (x, y) or (longitude, latitude) positions from
+    the first 2 columns of *data* and uses a combination of 1-7 criteria to
+    pass or reject the records. Records can be selected based on whether or not
+    they are:
+
+    1. inside a rectangular region (**region** [and **projection**])
+    2. within *dist* km of any point in *pointfile*
+    3. within *dist* km of any line in *linefile*
+    4. inside one of the polygons in the *polygonfile*
+    5. inside geographical features (based on coastlines)
+    6. has z-values within a given range, or
+    7. inside bins of a grid mask whose nodes are non-zero
+
+    The sense of the tests can be reversed for each of these 7 criteria by
+    using the **reverse** option.
+
+    Full option list at :gmt-docs:`gmtselect.html`
+
+    {aliases}
+
+    Parameters
+    ----------
+    data : str or {table-like}
+        Pass in either a file name to an ASCII data table, a 2D
+        {table-classes}.
+    outfile : str
+        The file name for the output ASCII file.
+    {A}
+    resolution : str
+        *resolution*\ [**+f**].
+        Ignored unless **mask** is set. Selects the resolution of the coastline
+        data set to use ((**f**)ull, (**h**)igh, (**i**)ntermediate, (**l**)ow,
+        or (**c**)rude). The resolution drops off by ~80% between data sets.
+        [Default is **l**]. Append (**+f**) to automatically select a lower
+        resolution should the one requested not be available [Default is abort
+        if not found]. Note that because the coastlines differ in details it is
+        not guaranteed that a point will remain inside [or outside] when a
+        different resolution is selected.
+    gridmask : str
+        Pass all locations that are inside the valid data area of the grid
+        *gridmask*. Nodes that are outside are either NaN or zero.
+    reverse : str
+        [**cflrsz**].
+        Reverses the sense of the test for each of the criteria specified:
+
+        - **c** select records NOT inside any point's circle of influence.
+        - **f** select records NOT inside any of the polygons.
+        - **g** will pass records inside the cells with z equal zero of the
+          grid mask in **gridmask**.
+        - **l** select records NOT within the specified distance of any line.
+        - **r** select records NOT inside the specified rectangular region.
+        - **s** select records NOT considered inside as specified by **mask**
+          (and **area_thresh**, **resolution**).
+        - **z** select records NOT within the range specified by
+          **z_subregion**.
+    {J}
+    mask : str or list
+        Pass all records whose location is inside specified geographical
+        features. Specify if records should be skipped (s) or kept (k) using
+        1 of 2 formats:
+
+        - *wet/dry*.
+        - *ocean/land/lake/island/pond*.
+
+        [Default is s/k/s/k/s (i.e., s/k), which passes all points on dry
+        land].
+    {R}
+    {V}
+    z_subregion : str
+        *min*\ [/*max*]\ [**+a**]\ [**+c**\ *col*]\ [**+i**].
+        Pass all records whose 3rd column (*z*; *col* = 2) lies within the
+        given range or is NaN (use **skiprows** to skip NaN records). If *max*
+        is omitted then we test if *z* equals *min* instead. This means
+        equality within 5 ULPs (unit of least precision;
+        http://en.wikipedia.org/wiki/Unit_in_the_last_place). Input file must
+        have at least three columns. To indicate no limit on min or max,
+        specify a hyphen (-). If your 3rd column is absolute time then remember
+        to supply ``coltypes="2T"``. To specify another column, append
+        **+c**\ *col*, and to specify several tests just repeat the
+        **z_subregion** option as many times as you have columns to test.
+        **Note**: When more than one **z_subregion** option is given then the
+        ``reverse="z"`` option cannot be used. In the case of multiple tests
+        you may use these modifiers as well: **+a** passes any record that
+        passes at least one of your *z* tests [Default is all tests must pass],
+        and **+i** reverses the tests to pass record with *z* value NOT in the
+        given range. Finally, if **+c** is not used then it is automatically
+        incremented for each new **z_subregion** option, starting with 2.
+    {b}
+    {d}
+    {e}
+    {f}
+    {g}
+    {h}
+    {i}
+    {o}
+    {r}
+    {s}
+    {w}
+
+    Returns
+    -------
+    output : pandas.DataFrame or None
+        Return type depends on whether the ``outfile`` parameter is set:
+
+        - :class:`pandas.DataFrame` table if ``outfile`` is not set.
+        - None if ``outfile`` is set (filtered output will be stored in file
+          set by ``outfile``).
+    """
+
+    with GMTTempFile(suffix=".csv") as tmpfile:
+        with Session() as lib:
+            # Choose how data will be passed into the module
+            table_context = lib.virtualfile_from_data(check_kind="vector", data=data)
+            with table_context as infile:
+                if outfile is None:
+                    outfile = tmpfile.name
+                arg_str = " ".join([infile, build_arg_string(kwargs), "->" + outfile])
+                lib.call_module(module="gmtselect", args=arg_str)
+
+        # Read temporary csv output to a pandas table
+        if outfile == tmpfile.name:  # if user did not set outfile, return pd.DataFrame
+            try:
+                column_names = data.columns.to_list()
+                result = pd.read_csv(tmpfile.name, sep="\t", names=column_names)
+            except AttributeError:  # 'str' object has no attribute 'columns'
+                result = pd.read_csv(tmpfile.name, sep="\t", header=None, comment=">")
+        elif outfile != tmpfile.name:  # return None if outfile set, output in outfile
+            result = None
+
+    return result
diff --git a/pygmt/tests/test_select.py b/pygmt/tests/test_select.py
@@ -0,0 +1,64 @@
+"""
+Tests for select.
+"""
+import os
+
+import numpy.testing as npt
+import pandas as pd
+import pytest
+from pygmt import select
+from pygmt.datasets import load_sample_bathymetry
+from pygmt.helpers import GMTTempFile
+
+
+@pytest.fixture(scope="module", name="dataframe")
+def fixture_dataframe():
+    """
+    Load the table data from the sample bathymetry dataset.
+    """
+    return load_sample_bathymetry()
+
+
+def test_select_input_dataframe(dataframe):
+    """
+    Run select by passing in a pandas.DataFrame as input.
+    """
+    output = select(data=dataframe, region=[250, 251, 26, 27])
+    assert isinstance(output, pd.DataFrame)
+    assert all(dataframe.columns == output.columns)
+    assert output.shape == (65, 3)
+    npt.assert_allclose(output.median(), [250.31464, 26.33893, -270.0])
+
+
+def test_select_input_table_matrix(dataframe):
+    """
+    Run select using table input that is not a pandas.DataFrame but still a
+    matrix.
+
+    Also testing the reverse (I) alias.
+    """
+    data = dataframe.values
+    output = select(data=data, region=[245.5, 254.5, 20.5, 29.5], reverse="r")
+    assert isinstance(output, pd.DataFrame)
+    assert output.shape == (9177, 3)
+    npt.assert_allclose(output.median(), [247.235, 20.48624, -3241.0])
+
+
+def test_select_input_filename():
+    """
+    Run select by passing in an ASCII text file as input.
+
+    Also testing the z_subregion (Z) alias.
+    """
+    with GMTTempFile() as tmpfile:
+        output = select(
+            data="@tut_ship.xyz",
+            region=[250, 251, 26, 27],
+            z_subregion=["-/-630", "-120/0+a"],
+            outfile=tmpfile.name,
+        )
+        assert output is None  # check that output is None since outfile is set
+        assert os.path.exists(path=tmpfile.name)
+        output = pd.read_csv(tmpfile.name, sep="\t", header=None)
+        assert output.shape == (5, 3)
+        npt.assert_allclose(output.median(), [250.12149, 26.04296, -674.0])