Skip to content

Commit 4aaec70

Browse files
Change "spec" to schema, and minor tweak on format
1 parent 37eba31 commit 4aaec70

File tree

5 files changed

+49 -18 lines changed

5 files changed

+49 -18 lines changed

bio2zarr/cli.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -56,10 +56,9 @@ def inspect(if_path, verbose):
5656

5757
@click.command
5858
@click.argument("if_path", type=click.Path())
59-
# @click.argument("specfile", type=click.Path())
60-
def genspec(if_path):
59+
def mkschema(if_path):
6160
stream = click.get_text_stream("stdout")
62-
vcf.generate_spec(if_path, stream)
61+
vcf.mkschema(if_path, stream)
6362

6463

6564
@click.command
@@ -105,7 +104,7 @@ def vcf2zarr():
105104

106105
vcf2zarr.add_command(explode)
107106
vcf2zarr.add_command(inspect)
108-
vcf2zarr.add_command(genspec)
107+
vcf2zarr.add_command(mkschema)
109108
vcf2zarr.add_command(to_zarr)
110109
vcf2zarr.add_command(convert_vcf)
111110
vcf2zarr.add_command(validate)

bio2zarr/vcf.py

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -887,7 +887,6 @@ def inspect(if_path):
887887

888888
@dataclasses.dataclass
889889
class ZarrColumnSpec:
890-
# TODO change to "variable_name"
891890
name: str
892891
dtype: str
893892
shape: tuple
@@ -898,6 +897,11 @@ class ZarrColumnSpec:
898897
compressor: dict
899898
# TODO add filters
900899

900+
def __post_init__(self):
901+
self.shape = tuple(self.shape)
902+
self.chunks = tuple(self.chunks)
903+
self.dimensions = tuple(self.dimensions)
904+
901905

902906
@dataclasses.dataclass
903907
class ZarrConversionSpec:
@@ -908,17 +912,24 @@ class ZarrConversionSpec:
908912
contig_id: list
909913
contig_length: list
910914
filter_id: list
911-
variables: list
915+
columns: dict
912916

913917
def asdict(self):
914918
return dataclasses.asdict(self)
915919

920+
def asjson(self):
921+
return json.dumps(self.asdict(), indent=4)
922+
916923
@staticmethod
917924
def fromdict(d):
918925
ret = ZarrConversionSpec(**d)
919-
ret.variables = [ZarrColumnSpec(**cd) for cd in d["variables"]]
926+
ret.columns = {key: ZarrColumnSpec(**value) for key,value in d["columns"].items()}
920927
return ret
921928

929+
@staticmethod
930+
def fromjson(s):
931+
return ZarrConversionSpec.fromdict(json.loads(s))
932+
922933
@staticmethod
923934
def generate(pcvcf, chunk_length=None, chunk_width=None):
924935
m = pcvcf.num_records
@@ -1070,7 +1081,7 @@ def fixed_field_spec(
10701081
return ZarrConversionSpec(
10711082
chunk_width=chunk_width,
10721083
chunk_length=chunk_length,
1073-
variables=colspecs,
1084+
columns={col.name: col for col in colspecs},
10741085
dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
10751086
sample_id=pcvcf.metadata.samples,
10761087
contig_id=pcvcf.metadata.contig_names,
@@ -1261,8 +1272,8 @@ def convert(
12611272
logger.info(f"Create zarr at {write_path}")
12621273
sgvcf = SgvcfZarr(write_path)
12631274
sgvcf.root = zarr.group(store=store, overwrite=True)
1264-
for variable in conversion_spec.variables[:]:
1265-
sgvcf.create_array(variable)
1275+
for column in conversion_spec.columns.values():
1276+
sgvcf.create_array(column)
12661277

12671278
progress_config = core.ProgressConfig(
12681279
total=pcvcf.total_uncompressed_bytes,
@@ -1287,7 +1298,7 @@ def convert(
12871298
)
12881299
pwm.submit(sgvcf.encode_filters, pcvcf, conversion_spec.filter_id)
12891300
has_gt = False
1290-
for variable in conversion_spec.variables[:]:
1301+
for variable in conversion_spec.columns.values():
12911302
if variable.vcf_field is not None:
12921303
# print("Encode", variable.name)
12931304
# TODO for large columns it's probably worth splitting up
@@ -1309,10 +1320,10 @@ def convert(
13091320
os.rename(write_path, path)
13101321

13111322

1312-
def generate_spec(if_path, out):
1323+
def mkschema(if_path, out):
13131324
pcvcf = PickleChunkedVcf.load(if_path)
13141325
spec = ZarrConversionSpec.generate(pcvcf)
1315-
json.dump(spec.asdict(), out, indent=4)
1326+
out.write(spec.asjson())
13161327

13171328

13181329
def to_zarr(
@@ -1323,8 +1334,7 @@ def to_zarr(
13231334
spec = ZarrConversionSpec.generate(pcvcf)
13241335
else:
13251336
with open(conversion_spec, "r") as f:
1326-
d = json.load(f)
1327-
spec = ZarrConversionSpec.fromdict(d)
1337+
spec = ZarrConversionSpec.fromjson(f.read())
13281338
SgvcfZarr.convert(
13291339
pcvcf,
13301340
zarr_path,

tests/test_cli.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,3 +33,17 @@ def test_inspect(self):
3333
assert result.stdout == "\n"
3434
assert len(result.stderr) == 0
3535
mocked.assert_called_once_with("path")
36+
37+
def test_mkschema(self):
38+
runner = ct.CliRunner(mix_stderr=False)
39+
with mock.patch("bio2zarr.vcf.mkschema") as mocked:
40+
result = runner.invoke(
41+
cli.vcf2zarr, ["mkschema", "path"], catch_exceptions=False
42+
)
43+
assert result.exit_code == 0
44+
assert len(result.stdout) == 0
45+
assert len(result.stderr) == 0
46+
# TODO figure out how to test that we call it with stdout from
47+
# the CliRunner
48+
# mocked.assert_called_once_with("path", stdout)
49+
mocked.assert_called_once()

tests/test_pcvcf.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,15 @@ def pcvcf(self, tmp_path_factory):
2222
out = tmp_path_factory.mktemp("data") / "example.exploded"
2323
return vcf.explode([self.data_path], out)
2424

25+
def test_mkschema(self, tmp_path, pcvcf):
26+
schema_file = tmp_path / "schema.json"
27+
with open(schema_file, "w") as f:
28+
vcf.mkschema(pcvcf.path, f)
29+
with open(schema_file, "r") as f:
30+
schema1 = vcf.ZarrConversionSpec.fromjson(f.read())
31+
schema2 = vcf.ZarrConversionSpec.generate(pcvcf)
32+
assert schema1 == schema2
33+
2534
def test_summary_table(self, pcvcf):
2635
data = pcvcf.summary_table()
2736
cols = [d["name"] for d in data]
@@ -110,8 +119,7 @@ def schema(self, pcvcf):
110119
],
111120
)
112121
def test_info_schemas(self, schema, name, dtype, shape):
113-
variables = [v for v in schema.variables if v.name == name]
114-
v = variables[0]
122+
v = schema.columns[name]
115123
assert v.dtype == dtype
116124
assert tuple(v.shape) == shape
117125

tests/test_vcf_examples.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -293,7 +293,7 @@ def test_full_pipeline(self, ds, tmp_path, worker_processes):
293293
)
294294
schema = tmp_path / "schema.json"
295295
with open(schema, "w") as f:
296-
vcf.generate_spec(exploded, f)
296+
vcf.mkschema(exploded, f)
297297
out = tmp_path / "example.zarr"
298298
vcf.to_zarr(exploded, out, schema, worker_processes=worker_processes)
299299
ds2 = sg.load_dataset(out)

0 commit comments

Comments
 (0)