Skip to content

Commit d1315b5

Browse files
authored
Merge pull request #167 from MIT-LCP/tp/validator
Add basic dataset validator. Ref #165
2 parents eb846c8 + 2943e3f commit d1315b5

File tree

18 files changed

+2337
-2
lines changed

18 files changed

+2337
-2
lines changed

physionet/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1-
from .api import PhysioNetClient
1+
from physionet.api import PhysioNetClient
2+
3+
# Start from the placeholder and overwrite it only when the installed
# distribution metadata can actually be read.
__version__ = "unknown"
try:
    from importlib.metadata import version

    __version__ = version("physionet")
except Exception:
    # Best effort: any failure while importing or querying the metadata
    # leaves the placeholder in place instead of breaking package import.
    pass
28

39
__all__ = ["PhysioNetClient"]

physionet/__main__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Allow running the CLI as a module: python -m physionet."""
2+
3+
import sys
4+
from physionet.cli import main
5+
6+
if __name__ == "__main__":
    # Hand the CLI's return value to the interpreter as the exit status.
    exit_code = main()
    sys.exit(exit_code)

physionet/cli.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
"""Command-line interface for physionet package."""
2+
3+
import argparse
4+
import json
5+
import sys
6+
from pathlib import Path
7+
8+
from physionet.validate import validate_dataset, ValidationConfig
9+
10+
11+
def main(argv=None):
    """Main entry point for the CLI.

    Args:
        argv: Optional list of command-line argument strings (without the
            program name). When ``None`` (the default), argparse reads
            ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.

    Returns:
        Integer exit status: the result of the ``validate`` handler, 0 after
        printing help when no subcommand is given, or 1 for an unrecognised
        command.
    """
    parser = argparse.ArgumentParser(
        prog="physionet",
        description="Tools for working with PhysioNet datasets",
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Validate subcommand
    validate_parser = subparsers.add_parser(
        "validate",
        help="Validate a dataset before submission to PhysioNet",
    )
    validate_parser.add_argument(
        "path",
        help="Path to the dataset directory to validate",
    )
    validate_parser.add_argument(
        "--report",
        metavar="FILE",
        # The handler picks the format from the extension, so say so here
        # instead of promising JSON unconditionally.
        help=(
            "Save the report to FILE: JSON if FILE ends in .json, otherwise "
            "Markdown (default: PHYSIONET_REPORT.md inside the dataset directory)"
        ),
    )
    validate_parser.add_argument(
        "--checks",
        metavar="CATEGORIES",
        help="Comma-separated list of check categories to run (filesystem,documentation,integrity,quality,privacy)",
    )
    validate_parser.add_argument(
        "--level",
        choices=["error", "warning", "info"],
        default="info",
        help="Minimum severity level to display (default: info)",
    )
    validate_parser.add_argument(
        "--no-sampling",
        action="store_true",
        help="Disable sampling for large files (scan all rows, slower but more thorough)",
    )
    validate_parser.add_argument(
        "--max-rows",
        type=int,
        metavar="N",
        help="Maximum rows to scan per CSV file (default: 10000)",
    )

    args = parser.parse_args(argv)

    if args.command == "validate":
        return _handle_validate(args)
    if args.command is None:
        # No subcommand given: show usage rather than failing.
        parser.print_help()
        return 0

    # Defensive fallback: argparse normally rejects unknown subcommands itself.
    print(f"Unknown command: {args.command}", file=sys.stderr)
    return 1
67+
68+
69+
def _handle_validate(args):
    """Handle the validate subcommand.

    Checks the command-line options, runs the dataset validation, prints the
    summary and writes a report file.

    Args:
        args: Parsed argparse namespace providing ``path``, ``report``,
            ``checks``, ``no_sampling`` and ``max_rows``.

    Returns:
        1 when the path or an option is invalid, when the validation result
        status is ``"error"``, or when an exception occurs while validating
        or writing the report; otherwise 0.
    """
    dataset_path = Path(args.path)
    if not dataset_path.exists():
        print(f"Error: Path does not exist: {args.path}", file=sys.stderr)
        return 1

    if not dataset_path.is_dir():
        print(f"Error: Path is not a directory: {args.path}", file=sys.stderr)
        return 1

    # Maps each --checks category name to the config flag it switches on.
    category_flags = {
        "filesystem": "check_filesystem",
        "documentation": "check_documentation",
        "integrity": "check_integrity",
        "quality": "check_quality",
        "privacy": "check_phi",
    }

    # Parse check categories if specified. Unknown names are rejected: a typo
    # would otherwise switch every check off and report a clean dataset.
    categories = None
    if args.checks:
        categories = {c.strip().lower() for c in args.checks.split(",")}
        categories.discard("")
        unknown = sorted(categories.difference(category_flags))
        if not categories:
            print("Error: No check categories given to --checks", file=sys.stderr)
            return 1
        if unknown:
            print(
                f"Error: Unknown check categories: {', '.join(unknown)} "
                f"(valid: {', '.join(category_flags)})",
                file=sys.stderr,
            )
            return 1

    # Compare against None so that an explicit 0 is not silently ignored.
    if args.max_rows is not None and args.max_rows <= 0:
        print(
            f"Error: --max-rows must be a positive integer, got {args.max_rows}",
            file=sys.stderr,
        )
        return 1

    # Configure validation
    config = ValidationConfig()
    if categories is not None:
        for name, flag in category_flags.items():
            setattr(config, flag, name in categories)

    # Configure sampling options
    if args.no_sampling:
        config.sample_large_files = False
    if args.max_rows is not None:
        config.max_rows_to_scan = args.max_rows

    # Run validation
    try:
        print(f"Validating dataset: {dataset_path}")
        result = validate_dataset(str(dataset_path), config, show_progress=True)
        print()

        print(result.summary())

        # Save validation report - either to specified path or default location
        if args.report:
            report_path = Path(args.report)
            # Format follows the file extension: .json -> JSON, else Markdown
            as_json = report_path.suffix.lower() == ".json"
        else:
            # Default: save as Markdown in the root of the dataset folder
            report_path = dataset_path / "PHYSIONET_REPORT.md"
            as_json = False

        with open(report_path, "w", encoding="utf-8") as f:
            if as_json:
                json.dump(result.to_dict(), f, indent=2)
            else:
                f.write(result.summary())

        print()
        print(f"Validation report saved to: {report_path}")

        # Only an "error" status fails the run; warnings exit with 0
        # regardless of --level.
        return 1 if result.status == "error" else 0

    except Exception as e:
        # Top-level CLI boundary: report the failure with a traceback and
        # signal it through the exit code instead of propagating.
        print(f"Error during validation: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1
139+
140+
141+
if __name__ == "__main__":
    # Hand the CLI's return value to the interpreter as the exit status.
    exit_code = main()
    sys.exit(exit_code)

physionet/validate/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Dataset validation module for PhysioNet submissions."""
2+
3+
from physionet.validate.validator import validate_dataset
4+
from physionet.validate.config import ValidationConfig
5+
from physionet.validate.models import ValidationResult
6+
7+
__all__ = ["validate_dataset", "ValidationConfig", "ValidationResult"]
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Validation check modules."""
2+
3+
from physionet.validate.checks.filesystem import check_filesystem
4+
from physionet.validate.checks.documentation import check_documentation
5+
from physionet.validate.checks.integrity import check_integrity
6+
from physionet.validate.checks.quality import check_quality
7+
from physionet.validate.checks.privacy import check_privacy
8+
9+
__all__ = [
10+
"check_filesystem",
11+
"check_documentation",
12+
"check_integrity",
13+
"check_quality",
14+
"check_privacy",
15+
]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Documentation validation checks."""
2+
3+
from pathlib import Path
4+
5+
from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity
6+
from physionet.validate.config import ValidationConfig
7+
8+
9+
def check_documentation(path: Path, config: ValidationConfig) -> CheckResult:
    """
    Report required documentation files that are missing from the dataset.

    Every name listed in ``config.required_files`` is resolved relative to
    ``path``; each one that does not exist is recorded as an ERROR issue.

    Args:
        path: Path to dataset directory
        config: Validation configuration

    Returns:
        CheckResult in the DOCUMENTATION category, with one issue per
        missing required file
    """
    # README.md gets a more detailed hint than the generic "add the file" one.
    readme_hint = (
        "Add README.md to your dataset. At minimum, the file should include "
        "a title and a brief description of the package content."
    )

    outcome = CheckResult(category=CheckCategory.DOCUMENTATION)

    for name in config.required_files:
        if (path / name).exists():
            continue

        hint = readme_hint if name == "README.md" else f"Add {name} to your dataset"
        issue = ValidationIssue(
            severity=Severity.ERROR,
            category=CheckCategory.DOCUMENTATION,
            file=name,
            message=f"Required file not found: {name}",
            suggestion=hint,
        )
        outcome.issues.append(issue)

    return outcome

0 commit comments

Comments
 (0)