Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/data_providers/data_format/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ formatting, but the available types are:
a fixed set of groups.
* **Ordered Categorical**: a factor where there is a logical order to the levels.
* **Numeric**: all kinds of numeric data.
* **Logical**: for logical (i.e. True vs False) data.
* **Taxa**: what taxa was the data collected from?
* **Abundance**: for abundance/density/presence data collected about a taxon.
* **Categorical Trait**: for categorical data collected on a taxon.
Expand Down Expand Up @@ -281,6 +282,20 @@ example. If this is the case, enter None rather than leaving the descriptors bla
you prefer to use Dimensionless as the unit for dimensionless quantities then that is
also fine!)

### Logical data

This field type should be used to record data that is either True or False. Data of this
type could alternatively be captured as categorical data with "True" and "False" being
used as levels. However, using this field type should make subsequent analyses simpler
as it ensures that the data remains stored as booleans.

We would recommend this field type for situations where only two unambiguous outcomes
are possible (e.g. whether a trap had been triggered when you returned to a plot). In
situations of greater ambiguity, such as describing the logging status of a forest, we
would recommend using the categorical data field type instead, as this allows you to
express multiple levels (i.e. unlogged, lightly logged vs heavily logged, rather than
just logged vs unlogged).

### Abundance and trait data

Both traits and abundance data tie a value (category or number) to a single taxon. You
Expand Down
1 change: 1 addition & 0 deletions docs/developers/api/field.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
- FileField
- GeoField
- LocationsField
- LogicalField
- NumericField
- NumericInteractionField
- NumericTaxaField
Expand Down
25 changes: 25 additions & 0 deletions safedata_validator/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from safedata_validator.validators import (
RE_DMS,
HasDuplicates,
IsBool,
IsInSet,
IsLocName,
IsNotBlank,
Expand Down Expand Up @@ -1573,6 +1574,30 @@ def validate_data(self, data: list) -> None:
self._log("Cells contain non-numeric values")


class LogicalField(BaseField):
    """A BaseField subclass for logical (true vs false) fields.

    Extends [BaseField][safedata_validator.field.BaseField] so that fields declared
    with the `logical` field type have their data checked for boolean values.
    """

    field_types: tuple[str, ...] = ("logical",)

    def validate_data(self, data: list) -> None:
        """Validate logical field data.

        The data is first passed through the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method. The values returned by that method are then tested with the IsBool
        filter, and a message is logged if the filter reports a failure.

        Args:
            data: A list of the data values for the field.
        """
        checked = self.run_common_validation(data)

        # Nothing to report when the filter evaluates as truthy.
        if IsBool(checked):
            return

        self._log("Cells contain non-boolean values")


class CategoricalField(BaseField):
"""A BaseField subclass for categorical and ordered categorical fields.

Expand Down
19 changes: 19 additions & 0 deletions safedata_validator/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,25 @@ def rfunc(val) -> Any:
return val


class IsBool(Filter):
    """A Filter subclass for boolean values.

    The `tfunc` method overrides the base
    [tfunc][safedata_validator.validators.Filter.tfunc] method to check if values are a
    bool. Failing values are kept unchanged in the instance values.
    """

    @staticmethod
    def tfunc(val) -> bool:
        """Test for boolean values.

        Args:
            val: The value to test.

        Returns:
            True if `val` is an instance of `bool`, otherwise False.
        """
        # Only genuine bool instances pass: bool is a subclass of int, not the
        # other way round, so integers such as 0 and 1 fail this test, as do
        # strings such as "True".
        return isinstance(val, bool)

    @staticmethod
    def rfunc(val) -> Any:
        """Return failing values unchanged."""
        return val


class IsNotNumericString(Filter):
"""A Filter subclass to trap numeric strings.

Expand Down
Binary file modified test/fixtures/Test_format_bad.xlsx
Binary file not shown.
Binary file modified test/fixtures/Test_format_good.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"file_key, n_errors, n_taxa",
[
("good_excel_file", 0, 20),
("bad_excel_file", 94, 13),
("bad_excel_file", 95, 13),
("good_seq_taxa_file", 0, 48),
("bad_seq_taxa_file", 6, 34),
],
Expand Down
2 changes: 1 addition & 1 deletion test/test_dataworksheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@ def test_DataWorksheet_load_from_worksheet(
[
("good", "DF", 0),
("good", "Incidence", 0),
("bad", "DF", 41),
("bad", "DF", 42),
("bad", "Incidence", 8),
],
indirect=["example_excel_files"], # take actual params from fixture
Expand Down
68 changes: 68 additions & 0 deletions test/test_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -1193,6 +1193,74 @@ def test_NumericField_and_subclasses_validate_data(
)


@pytest.mark.parametrize(
    "data, expected_log",
    [
        (
            [True, False, False, False, True, True, True, False, False],
            ((INFO, "Checking field trap_empty"),),
        ),
        (
            [True, "NA", False, False, True, True, "NA", False, False],
            ((INFO, "Checking field trap_empty"), (WARNING, "2 / 9 values missing")),
        ),
        (
            [True, None, False, False, True, True, " ", False, False],
            (
                (INFO, "Checking field trap_empty"),
                (ERROR, "2 cells are blank or contain only whitespace text"),
            ),
        ),
        (
            [True, False, False, False, "#REF!", True, "#N/A", False, False],
            (
                (INFO, "Checking field trap_empty"),
                (ERROR, "2 cells contain Excel formula errors"),
            ),
        ),
        (
            [True, False, False, False, "wrong_type", True, True, False, False],
            (
                (INFO, "Checking field trap_empty"),
                (ERROR, "Cells contain non-boolean values"),
            ),
        ),
        (
            [True, False, "NA", False, "wrong_type", True, None, False, False],
            (
                (INFO, "Checking field trap_empty"),
                (ERROR, "Cells contain non-boolean values"),
                (WARNING, "1 / 9 values missing"),
                (ERROR, "1 cells are blank or contain only whitespace text"),
            ),
        ),
    ],
)
def test_LogicalField_validate_data(caplog, fixture_dataset, data, expected_log):
    """Check the log records produced by LogicalField.validate_data and report."""
    from safedata_validator.field import LogicalField

    # Build a LogicalField from minimal field metadata
    logical_field = LogicalField(
        {
            "field_type": "logical",
            "description": "Was small mammal trap empty?",
            "field_name": "trap_empty",
        },
        dataset=fixture_dataset,
    )

    logical_field.validate_data(data)
    logical_field.report()

    records = caplog.records

    # The number of records must match before comparing them pairwise
    assert len(expected_log) == len(records)

    # Log levels are checked for every record first, then the message text
    levels_match = [
        expected[0] == record.levelno for expected, record in zip(expected_log, records)
    ]
    assert all(levels_match)

    messages_match = [
        expected[1] in record.message for expected, record in zip(expected_log, records)
    ]
    assert all(messages_match)


# CategoricalField and derived classes
# - Can reuse the same data to also check taxon and interaction classes
# inheriting from CategoricalField for validate_data.
Expand Down