Skip to content

Commit bc140cb

Browse files
Wrap gmtselect (#1429)
* Wrap the gmtselect function, which selects data table subsets based on multiple spatial criteria. Aliased the non-common optional parameters reverse (I), z_subregion (Z), area_thresh (A), resolution (D), gridmask (G), and mask (N). * Add tests for select. * Add imports for select. Co-authored-by: Will Schlitzer <[email protected]>
1 parent d12470f commit bc140cb

File tree

5 files changed

+239
-0
lines changed

5 files changed

+239
-0
lines changed

doc/api/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ Operations on tabular data:
8484
blockmode
8585
nearneighbor
8686
project
87+
select
8788
sph2grd
8889
sphdistance
8990
sphinterpolate

pygmt/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
makecpt,
5252
nearneighbor,
5353
project,
54+
select,
5455
sph2grd,
5556
sphdistance,
5657
sphinterpolate,

pygmt/src/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from pygmt.src.plot3d import plot3d
3939
from pygmt.src.project import project
4040
from pygmt.src.rose import rose
41+
from pygmt.src.select import select
4142
from pygmt.src.solar import solar
4243
from pygmt.src.sph2grd import sph2grd
4344
from pygmt.src.sphdistance import sphdistance

pygmt/src/select.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
"""
2+
select - Select data table subsets based on multiple spatial criteria.
3+
"""
4+
import pandas as pd
5+
from pygmt.clib import Session
6+
from pygmt.helpers import (
7+
GMTTempFile,
8+
build_arg_string,
9+
fmt_docstring,
10+
kwargs_to_strings,
11+
use_alias,
12+
)
13+
14+
15+
@fmt_docstring
@use_alias(
    A="area_thresh",
    D="resolution",
    G="gridmask",
    I="reverse",
    J="projection",
    N="mask",
    R="region",
    V="verbose",
    Z="z_subregion",
    b="binary",
    d="nodata",
    e="find",
    f="coltypes",
    g="gap",
    h="header",
    i="incols",
    o="outcols",
    r="registration",
    s="skiprows",
    w="wrap",
)
@kwargs_to_strings(M="sequence", R="sequence", i="sequence_comma", o="sequence_comma")
def select(data=None, outfile=None, **kwargs):
    r"""
    Select data table subsets based on multiple spatial criteria.

    This is a filter that reads (x, y) or (longitude, latitude) positions from
    the first 2 columns of *data* and uses a combination of 1-7 criteria to
    pass or reject the records. Records can be selected based on whether or not
    they are:

    1. inside a rectangular region (**region** [and **projection**])
    2. within *dist* km of any point in *pointfile*
    3. within *dist* km of any line in *linefile*
    4. inside one of the polygons in the *polygonfile*
    5. inside geographical features (based on coastlines)
    6. has z-values within a given range, or
    7. inside bins of a grid mask whose nodes are non-zero

    The sense of the tests can be reversed for each of these 7 criteria by
    using the **reverse** option.

    Full option list at :gmt-docs:`gmtselect.html`

    {aliases}

    Parameters
    ----------
    data : str or {table-like}
        Pass in either a file name to an ASCII data table, a 2D
        {table-classes}.
    outfile : str
        The file name for the output ASCII file.
    {A}
    resolution : str
        *resolution*\ [**+f**].
        Ignored unless **mask** is set. Selects the resolution of the coastline
        data set to use ((**f**)ull, (**h**)igh, (**i**)ntermediate, (**l**)ow,
        or (**c**)rude). The resolution drops off by ~80% between data sets.
        [Default is **l**]. Append (**+f**) to automatically select a lower
        resolution should the one requested not be available [Default is abort
        if not found]. Note that because the coastlines differ in details it is
        not guaranteed that a point will remain inside [or outside] when a
        different resolution is selected.
    gridmask : str
        Pass all locations that are inside the valid data area of the grid
        *gridmask*. Nodes that are outside are either NaN or zero.
    reverse : str
        [**cfglrsz**].
        Reverses the sense of the test for each of the criteria specified:

        - **c** select records NOT inside any point's circle of influence.
        - **f** select records NOT inside any of the polygons.
        - **g** will pass records inside the cells with z equal zero of the
          grid mask in **gridmask**.
        - **l** select records NOT within the specified distance of any line.
        - **r** select records NOT inside the specified rectangular region.
        - **s** select records NOT considered inside as specified by **mask**
          (and **area_thresh**, **resolution**).
        - **z** select records NOT within the range specified by
          **z_subregion**.
    {J}
    mask : str or list
        Pass all records whose location is inside specified geographical
        features. Specify if records should be skipped (s) or kept (k) using
        1 of 2 formats:

        - *wet/dry*.
        - *ocean/land/lake/island/pond*.

        [Default is s/k/s/k/s (i.e., s/k), which passes all points on dry
        land].
    {R}
    {V}
    z_subregion : str
        *min*\ [/*max*]\ [**+a**]\ [**+c**\ *col*]\ [**+i**].
        Pass all records whose 3rd column (*z*; *col* = 2) lies within the
        given range or is NaN (use **skiprows** to skip NaN records). If *max*
        is omitted then we test if *z* equals *min* instead. This means
        equality within 5 ULPs (unit of least precision;
        http://en.wikipedia.org/wiki/Unit_in_the_last_place). Input file must
        have at least three columns. To indicate no limit on min or max,
        specify a hyphen (-). If your 3rd column is absolute time then remember
        to supply ``coltypes="2T"``. To specify another column, append
        **+c**\ *col*, and to specify several tests just repeat the
        **z_subregion** option as many times as you have columns to test.
        **Note**: When more than one **z_subregion** option is given then the
        ``reverse="z"`` option cannot be used. In the case of multiple tests
        you may use these modifiers as well: **+a** passes any record that
        passes at least one of your *z* tests [Default is all tests must pass],
        and **+i** reverses the tests to pass record with *z* value NOT in the
        given range. Finally, if **+c** is not used then it is automatically
        incremented for each new **z_subregion** option, starting with 2.
    {b}
    {d}
    {e}
    {f}
    {g}
    {h}
    {i}
    {o}
    {r}
    {s}
    {w}

    Returns
    -------
    output : pandas.DataFrame or None
        Return type depends on whether the ``outfile`` parameter is set:

        - :class:`pandas.DataFrame` table if ``outfile`` is not set. The
          columns are named after ``data.columns`` when ``data`` exposes
          that attribute; otherwise they are numbered.
        - None if ``outfile`` is set (filtered output will be stored in file
          set by ``outfile``).
    """
    # Decide once whether the caller wants a DataFrame back, rather than
    # inferring it later by comparing file names.
    return_table = outfile is None

    with GMTTempFile(suffix=".csv") as tmpfile:
        if return_table:
            outfile = tmpfile.name

        with Session() as lib:
            # Choose how data will be passed into the module
            table_context = lib.virtualfile_from_data(check_kind="vector", data=data)
            with table_context as infile:
                arg_str = " ".join([infile, build_arg_string(kwargs), "->" + outfile])
                lib.call_module(module="gmtselect", args=arg_str)

        if not return_table:
            # The filtered records live in the user-supplied outfile.
            result = None
        else:
            # Only the attribute probe is guarded, so that an AttributeError
            # raised while parsing the file is not mistaken for "input has no
            # column names".
            try:
                column_names = data.columns.to_list()
            except AttributeError:  # e.g. 'str' object has no attribute 'columns'
                column_names = None

            # Read temporary csv output to a pandas table (must happen while
            # the temporary file still exists).
            if column_names is not None:
                result = pd.read_csv(tmpfile.name, sep="\t", names=column_names)
            else:
                result = pd.read_csv(
                    tmpfile.name, sep="\t", header=None, comment=">"
                )

    return result

pygmt/tests/test_select.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Tests for select.
3+
"""
4+
import os
5+
6+
import numpy.testing as npt
7+
import pandas as pd
8+
import pytest
9+
from pygmt import select
10+
from pygmt.datasets import load_sample_bathymetry
11+
from pygmt.helpers import GMTTempFile
12+
13+
14+
@pytest.fixture(scope="module", name="dataframe")
def fixture_dataframe():
    """
    Provide the sample bathymetry table, loaded once per test module.
    """
    bathymetry = load_sample_bathymetry()
    return bathymetry
20+
21+
22+
def test_select_input_dataframe(dataframe):
    """
    Check select when the input table is a pandas.DataFrame.
    """
    bounds = [250, 251, 26, 27]
    selected = select(data=dataframe, region=bounds)

    assert isinstance(selected, pd.DataFrame)
    # Column names of the input must carry over to the output table.
    assert all(dataframe.columns == selected.columns)
    assert selected.shape == (65, 3)

    expected_median = [250.31464, 26.33893, -270.0]
    npt.assert_allclose(selected.median(), expected_median)
31+
32+
33+
def test_select_input_table_matrix(dataframe):
    """
    Check select when the input table is a matrix instead of a
    pandas.DataFrame.

    The reverse (I) alias is exercised as well.
    """
    matrix = dataframe.values
    bounds = [245.5, 254.5, 20.5, 29.5]
    selected = select(data=matrix, region=bounds, reverse="r")

    assert isinstance(selected, pd.DataFrame)
    assert selected.shape == (9177, 3)

    expected_median = [247.235, 20.48624, -3241.0]
    npt.assert_allclose(selected.median(), expected_median)
45+
46+
47+
def test_select_input_filename():
    """
    Check select when the input is the name of an ASCII text file.

    The z_subregion (Z) alias is exercised as well.
    """
    with GMTTempFile() as tmpfile:
        returned = select(
            data="@tut_ship.xyz",
            region=[250, 251, 26, 27],
            z_subregion=["-/-630", "-120/0+a"],
            outfile=tmpfile.name,
        )
        # With outfile set, nothing is returned; results go to the file.
        assert returned is None
        assert os.path.exists(path=tmpfile.name)

        table = pd.read_csv(tmpfile.name, sep="\t", header=None)
        assert table.shape == (5, 3)

        expected_median = [250.12149, 26.04296, -674.0]
        npt.assert_allclose(table.median(), expected_median)

0 commit comments

Comments
 (0)