Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 47 additions & 35 deletions lib/vcf_encoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ vcz_itoa(char *restrict buf, int64_t value)
}

int
vcz_ftoa(char *restrict buf, float value)
vcz_ftoa(char *restrict buf, float value, int8_t precision)
{
int p = 0;
int64_t i, d1, d2, d3;
int64_t i;

precision = 9;

if (!isfinite(value) || fabs(value) > INT32_MAX + 1LL) {
return sprintf(buf, "%.3f", value);
return sprintf(buf, "%.*f", precision, value);
}

if (value < 0) {
Expand All @@ -79,27 +81,37 @@ vcz_ftoa(char *restrict buf, float value)
}

/* integer part */
i = (int64_t) round(((double) value) * 1000);
p += vcz_itoa(buf + p, i / 1000);

/* fractional part */
d3 = i % 10;
d2 = (i / 10) % 10;
d1 = (i / 100) % 10;
if (d1 + d2 + d3 > 0) {
buf[p] = '.';
p++;
buf[p] = (char) d1 + '0';
p++;
if (d2 + d3 > 0) {
buf[p] = (char) d2 + '0';
int magnitudes[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};

i = (int64_t) round(((double) value) * magnitudes[precision]);
p += vcz_itoa(buf + p, i / magnitudes[precision]);

int d[precision];
for (int j = 0; j < precision; j++) {
d[precision - 1- j ] = (i / magnitudes[j]) % 10;
}

buf[p] = '.';
p++;

int r = 0;
for (int j = 0; j < precision; j++) {
int sum_ = 0;
for (int h = r; h < precision; h++) {
sum_ += d[h];
};
if (sum_ > 0) {
buf[p] = (char) d[r] + '0';
p++;
if (d3 > 0) {
buf[p] = (char) d3 + '0';
p++;
}
}
};
r++;
}

if (buf[p-1] == '.') {
buf[p] = '0';
p++;
}

buf[p] = '\0';
return p;
}
Expand Down Expand Up @@ -146,15 +158,15 @@ append_int(char *restrict buf, int32_t value, int64_t offset, int64_t buflen)

static inline int64_t
append_float(
char *restrict buf, int32_t int32_value, float value, int64_t offset, int64_t buflen)
char *restrict buf, int32_t int32_value, float value, int8_t precision, int64_t offset, int64_t buflen)
{
char tmp[VCZ_FLOAT32_BUF_SIZE];
int64_t len;

if (int32_value == VCZ_FLOAT32_MISSING_AS_INT32) {
return append_char(buf, '.', offset, buflen);
}
len = vcz_ftoa(tmp, value);
len = vcz_ftoa(tmp, value, precision);
/* printf("%f: %d\n", value, (int) len); */
return append_string(buf, tmp, len, offset, buflen);
}
Expand Down Expand Up @@ -342,7 +354,7 @@ int32_write_entry(size_t num_columns, const void *restrict data, char *buf,
}

static int64_t
float32_write_entry(size_t num_columns, const void *restrict data, char *restrict buf,
float32_write_entry(size_t num_columns, const void *restrict data, int8_t precision, char *restrict buf,
int64_t buflen, int64_t offset)
{
const float *restrict source = (const float *restrict) data;
Expand All @@ -361,7 +373,7 @@ float32_write_entry(size_t num_columns, const void *restrict data, char *restric
goto out;
}
}
offset = append_float(buf, int32_value, source[column], offset, buflen);
offset = append_float(buf, int32_value, source[column], precision, offset, buflen);
if (offset < 0) {
goto out;
}
Expand All @@ -371,7 +383,7 @@ float32_write_entry(size_t num_columns, const void *restrict data, char *restric
}

static int64_t
write_entry(int type, size_t item_size, size_t num_columns, const void *data, char *buf,
write_entry(int type, size_t item_size, size_t num_columns, const void *data, int8_t precision, char *buf,
int64_t buflen, int64_t offset)
{
if (type == VCZ_TYPE_INT) {
Expand All @@ -386,7 +398,7 @@ write_entry(int type, size_t item_size, size_t num_columns, const void *data, ch
}
} else if (type == VCZ_TYPE_FLOAT) {
assert(item_size == 4);
return float32_write_entry(num_columns, data, buf, buflen, offset);
return float32_write_entry(num_columns, data, precision, buf, buflen, offset);
} else if (type == VCZ_TYPE_BOOL) {
assert(item_size == 1);
assert(num_columns == 1);
Expand Down Expand Up @@ -423,13 +435,13 @@ all_missing(int type, size_t item_size, size_t n, const char *restrict data)

int64_t
vcz_field_write_1d(
const vcz_field_t *self, size_t variant, char *buf, int64_t buflen, int64_t offset)
const vcz_field_t *self, size_t variant, int8_t precision, char *buf, int64_t buflen, int64_t offset)
{
size_t row_size = self->num_columns * self->item_size;
const void *data = self->data + variant * row_size;

return write_entry(
self->type, self->item_size, self->num_columns, data, buf, buflen, offset);
self->type, self->item_size, self->num_columns, data, precision, buf, buflen, offset);
}

static bool
Expand All @@ -443,13 +455,13 @@ vcz_field_is_missing_1d(const vcz_field_t *self, size_t variant)

static int64_t
vcz_field_write_2d(const vcz_field_t *self, size_t variant, size_t num_samples,
size_t sample, char *buf, int64_t buflen, int64_t offset)
size_t sample, int8_t precision, char *buf, int64_t buflen, int64_t offset)
{
size_t row_size = self->num_columns * self->item_size * num_samples;
const void *data
= self->data + variant * row_size + sample * self->num_columns * self->item_size;
return write_entry(
self->type, self->item_size, self->num_columns, data, buf, buflen, offset);
self->type, self->item_size, self->num_columns, data, precision, buf, buflen, offset);
}

static bool
Expand Down Expand Up @@ -605,7 +617,7 @@ vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t
if (offset < 0) {
goto out;
}
offset = vcz_field_write_1d(&field, variant, buf, buflen, offset);
offset = vcz_field_write_1d(&field, variant, self->precision, buf, buflen, offset);
if (offset < 0) {
goto out;
}
Expand Down Expand Up @@ -703,7 +715,7 @@ vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self,
if (!missing[j]) {
field = self->format_fields[j];
offset = vcz_field_write_2d(&self->format_fields[j], variant,
num_samples, sample, buf, buflen, offset);
num_samples, sample, self->precision, buf, buflen, offset);
if (offset < 0) {
goto out;
}
Expand Down Expand Up @@ -798,7 +810,7 @@ vcz_variant_encoder_encode(
}
} else {
offset = vcz_field_write_1d(
&self->fixed_fields[j], variant, buf, (int64_t) buflen, offset);
&self->fixed_fields[j], variant, self->precision, buf, (int64_t) buflen, offset);
if (offset < 0) {
goto out;
}
Expand Down
5 changes: 3 additions & 2 deletions lib/vcf_encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ typedef struct {
int vcz_field_init(vcz_field_t *self, const char *name, int type, size_t item_size,
size_t num_columns, const void *data);
int64_t vcz_field_write_1d(
const vcz_field_t *self, size_t row, char *buf, int64_t buflen, int64_t offset);
const vcz_field_t *self, size_t row, int8_t precision, char *buf, int64_t buflen, int64_t offset);
void vcz_field_print_state(const vcz_field_t *self, FILE *out);

typedef struct {
Expand All @@ -77,6 +77,7 @@ typedef struct {
size_t max_format_fields;
size_t field_array_size_increment;
vcz_field_t *format_fields;
int8_t precision;
} vcz_variant_encoder_t;

int vcz_variant_encoder_init(
Expand Down Expand Up @@ -108,7 +109,7 @@ int64_t vcz_variant_encoder_encode(
const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen);

int vcz_itoa(char *buf, int64_t v);
int vcz_ftoa(char *buf, float v);
int vcz_ftoa(char *buf, float v, int8_t precision);


#define VCZ_PLINK_HOM_A1 0x0 /* 00 */
Expand Down
14 changes: 14 additions & 0 deletions tests/test_ftoa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pytest

from vcztools.ftoa import ftoa

@pytest.mark.parametrize(
("float_", "string_", "precision"),
[
(1.12345, "1.123", 3),
(1.12345, "1.1234", 4),
(1.12345, "1.12345", 5),
]
)
def test_ftoa(float_, string_, precision):
assert ftoa(float_, precision) == string_
7 changes: 4 additions & 3 deletions vcztools/_vcztoolsmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds)
{
int ret = -1;
static char *kwlist[] = { "num_variants", "num_samples", "chrom", "pos", "id", "ref",
"alt", "qual", "filter_ids", "filter", NULL };
"alt", "qual", "filter_ids", "filter", "precision", NULL };
int num_variants, num_samples;
PyArrayObject *chrom = NULL;
PyArrayObject *pos = NULL;
Expand All @@ -224,6 +224,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds)
PyArrayObject *qual = NULL;
PyArrayObject *filter_ids = NULL;
PyArrayObject *filter = NULL;
int precision = NULL;
npy_intp num_filters;
int err;

Expand All @@ -236,10 +237,10 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds)
goto out; // GCOVR_EXCL_LINE
}

if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!O!O!O!", kwlist,
if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!O!O!O!i", kwlist,
&num_variants, &num_samples, &PyArray_Type, &chrom, &PyArray_Type, &pos,
&PyArray_Type, &id, &PyArray_Type, &ref, &PyArray_Type, &alt, &PyArray_Type,
&qual, &PyArray_Type, &filter_ids, &PyArray_Type, &filter)) {
&qual, &PyArray_Type, &filter_ids, &PyArray_Type, &filter, &precision)) {
goto out;
}

Expand Down
13 changes: 13 additions & 0 deletions vcztools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ def wrapper(*args, **kwargs):
default=None,
help="Target regions to include.",
)
precision = click.option(
"-p",
"--precision",
type=int,
default=3,
help="precision level for writing floats",
)
version = click.version_option(version=f"{provenance.__version__}")


Expand Down Expand Up @@ -148,6 +155,7 @@ def index(path, nrecords, stats):
@targets
@include
@exclude
@precision
@handle_exception
def query(
path,
Expand All @@ -160,6 +168,7 @@ def query(
samples,
include,
exclude,
precision,
):
"""
Transform VCZ into user-defined formats with efficient subsetting and
Expand Down Expand Up @@ -190,6 +199,7 @@ def query(
force_samples=force_samples,
include=include,
exclude=exclude,
precision=precision,
)


Expand Down Expand Up @@ -238,6 +248,7 @@ def query(
@targets
@include
@exclude
@precision
@handle_exception
def view(
path,
Expand All @@ -254,6 +265,7 @@ def view(
drop_genotypes,
include,
exclude,
precision,
):
"""
Convert VCZ dataset to VCF with efficient subsetting and filtering.
Expand Down Expand Up @@ -300,6 +312,7 @@ def view(
drop_genotypes=drop_genotypes,
include=include,
exclude=exclude,
precision=precision,
)


Expand Down
29 changes: 29 additions & 0 deletions vcztools/ftoa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@


def ftoa(f, precision=3):
buf = ""

if f < 0:
buf += '-'
f = -f

i = round(f * 10**precision)
buf += str(i // 10**precision)

d = [0] * precision

for idx in range(precision):
d[idx] = (i // 10**idx) % 10

buf += "."
for idx in range(precision):
sum_ = 0
for j in d:
sum_ += j
if sum_ > 0:
buf += str(d.pop())

if buf[-1] == ".":
buf += "0"

return buf
4 changes: 4 additions & 0 deletions vcztools/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def write_vcf(
drop_genotypes: bool = False,
include: str | None = None,
exclude: str | None = None,
precision
) -> None:
root = zarr.open(vcz, mode="r")

Expand Down Expand Up @@ -138,6 +139,7 @@ def write_vcf(
output,
drop_genotypes=drop_genotypes,
no_update=no_update,
precision=precision,
)


Expand All @@ -150,6 +152,7 @@ def c_chunk_to_vcf(
*,
drop_genotypes,
no_update,
precision,
):
format_fields = {}
info_fields = {}
Expand Down Expand Up @@ -240,6 +243,7 @@ def c_chunk_to_vcf(
qual=qual,
filter_ids=filters,
filter=filter_,
precision = precision,
)
# print(encoder.arrays)
if gt is not None:
Expand Down
Loading