diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c index 8f27d30..686258a 100644 --- a/lib/vcf_encoder.c +++ b/lib/vcf_encoder.c @@ -63,13 +63,15 @@ vcz_itoa(char *restrict buf, int64_t value) } int -vcz_ftoa(char *restrict buf, float value) +vcz_ftoa(char *restrict buf, float value, int8_t precision) { int p = 0; - int64_t i, d1, d2, d3; + int64_t i; + + precision = 9; if (!isfinite(value) || fabs(value) > INT32_MAX + 1LL) { - return sprintf(buf, "%.3f", value); + return sprintf(buf, "%.*f", precision, value); } if (value < 0) { @@ -79,27 +81,37 @@ vcz_ftoa(char *restrict buf, float value) } /* integer part */ - i = (int64_t) round(((double) value) * 1000); - p += vcz_itoa(buf + p, i / 1000); - - /* fractional part */ - d3 = i % 10; - d2 = (i / 10) % 10; - d1 = (i / 100) % 10; - if (d1 + d2 + d3 > 0) { - buf[p] = '.'; - p++; - buf[p] = (char) d1 + '0'; - p++; - if (d2 + d3 > 0) { - buf[p] = (char) d2 + '0'; + int magnitudes[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; + + i = (int64_t) round(((double) value) * magnitudes[precision]); + p += vcz_itoa(buf + p, i / magnitudes[precision]); + + int d[precision]; + for (int j = 0; j < precision; j++) { + d[precision - 1- j ] = (i / magnitudes[j]) % 10; + } + + buf[p] = '.'; + p++; + + int r = 0; + for (int j = 0; j < precision; j++) { + int sum_ = 0; + for (int h = r; h < precision; h++) { + sum_ += d[h]; + }; + if (sum_ > 0) { + buf[p] = (char) d[r] + '0'; p++; - if (d3 > 0) { - buf[p] = (char) d3 + '0'; - p++; - } - } + }; + r++; } + + if (buf[p-1] == '.') { + buf[p] = '0'; + p++; + } + buf[p] = '\0'; return p; } @@ -146,7 +158,7 @@ append_int(char *restrict buf, int32_t value, int64_t offset, int64_t buflen) static inline int64_t append_float( - char *restrict buf, int32_t int32_value, float value, int64_t offset, int64_t buflen) + char *restrict buf, int32_t int32_value, float value, int8_t precision, int64_t offset, int64_t buflen) { char tmp[VCZ_FLOAT32_BUF_SIZE]; int64_t len; @@ -154,7 +166,7 @@ append_float( if (int32_value == VCZ_FLOAT32_MISSING_AS_INT32) { return append_char(buf, '.', offset, buflen); } - len = vcz_ftoa(tmp, value); + len = vcz_ftoa(tmp, value, precision); /* printf("%f: %d\n", value, (int) len); */ return append_string(buf, tmp, len, offset, buflen); } @@ -342,7 +354,7 @@ int32_write_entry(size_t num_columns, const void *restrict data, char *buf, } static int64_t -float32_write_entry(size_t num_columns, const void *restrict data, char *restrict buf, +float32_write_entry(size_t num_columns, const void *restrict data, int8_t precision, char *restrict buf, int64_t buflen, int64_t offset) { const float *restrict source = (const float *restrict) data; @@ -361,7 +373,7 @@ float32_write_entry(size_t num_columns, const void *restrict data, char *restric goto out; } } - offset = append_float(buf, int32_value, source[column], offset, buflen); + offset = append_float(buf, int32_value, source[column], precision, offset, buflen); if (offset < 0) { goto out; } @@ -371,7 +383,7 @@ float32_write_entry(size_t num_columns, const void *restrict data, char *restric } static int64_t -write_entry(int type, size_t item_size, size_t num_columns, const void *data, char *buf, +write_entry(int type, size_t item_size, size_t num_columns, const void *data, int8_t precision, char *buf, int64_t buflen, int64_t offset) { if (type == VCZ_TYPE_INT) { @@ -386,7 +398,7 @@ write_entry(int type, size_t item_size, size_t num_columns, const void *data, ch } } else if (type == VCZ_TYPE_FLOAT) { assert(item_size == 4); - return float32_write_entry(num_columns, data, buf, buflen, offset); + return float32_write_entry(num_columns, data, precision, buf, buflen, offset); } else if (type == VCZ_TYPE_BOOL) { assert(item_size == 1); assert(num_columns == 1); @@ -423,13 +435,13 @@ all_missing(int type, size_t item_size, size_t n, const char *restrict data) int64_t vcz_field_write_1d( - const vcz_field_t *self, size_t variant, char *buf, int64_t buflen, int64_t offset) + const vcz_field_t *self, size_t variant, int8_t precision, char *buf, int64_t buflen, int64_t offset) { size_t row_size = self->num_columns * self->item_size; const void *data = self->data + variant * row_size; return write_entry( - self->type, self->item_size, self->num_columns, data, buf, buflen, offset); + self->type, self->item_size, self->num_columns, data, precision, buf, buflen, offset); } static bool @@ -443,13 +455,13 @@ vcz_field_is_missing_1d(const vcz_field_t *self, size_t variant) static int64_t vcz_field_write_2d(const vcz_field_t *self, size_t variant, size_t num_samples, - size_t sample, char *buf, int64_t buflen, int64_t offset) + size_t sample, int8_t precision, char *buf, int64_t buflen, int64_t offset) { size_t row_size = self->num_columns * self->item_size * num_samples; const void *data = self->data + variant * row_size + sample * self->num_columns * self->item_size; return write_entry( - self->type, self->item_size, self->num_columns, data, buf, buflen, offset); + self->type, self->item_size, self->num_columns, data, precision, buf, buflen, offset); } static bool @@ -605,7 +617,7 @@ vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t if (offset < 0) { goto out; } - offset = vcz_field_write_1d(&field, variant, buf, buflen, offset); + offset = vcz_field_write_1d(&field, variant, self->precision, buf, buflen, offset); if (offset < 0) { goto out; } @@ -703,7 +715,7 @@ vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self, if (!missing[j]) { field = self->format_fields[j]; offset = vcz_field_write_2d(&self->format_fields[j], variant, - num_samples, sample, buf, buflen, offset); + num_samples, sample, self->precision, buf, buflen, offset); if (offset < 0) { goto out; } @@ -798,7 +810,7 @@ vcz_variant_encoder_encode( } } else { offset = vcz_field_write_1d( - &self->fixed_fields[j], variant, buf, (int64_t) buflen, offset); + &self->fixed_fields[j], variant, self->precision, buf, (int64_t) buflen, offset); if (offset < 0) { goto out; } diff --git a/lib/vcf_encoder.h b/lib/vcf_encoder.h index b2d6291..f2b0461 100644 --- a/lib/vcf_encoder.h +++ b/lib/vcf_encoder.h @@ -59,7 +59,7 @@ typedef struct { int vcz_field_init(vcz_field_t *self, const char *name, int type, size_t item_size, size_t num_columns, const void *data); int64_t vcz_field_write_1d( - const vcz_field_t *self, size_t row, char *buf, int64_t buflen, int64_t offset); + const vcz_field_t *self, size_t row, int8_t precision, char *buf, int64_t buflen, int64_t offset); void vcz_field_print_state(const vcz_field_t *self, FILE *out); typedef struct { @@ -77,6 +77,7 @@ typedef struct { size_t max_format_fields; size_t field_array_size_increment; vcz_field_t *format_fields; + int8_t precision; } vcz_variant_encoder_t; int vcz_variant_encoder_init( @@ -108,7 +109,7 @@ int64_t vcz_variant_encoder_encode( const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen); int vcz_itoa(char *buf, int64_t v); -int vcz_ftoa(char *buf, float v); +int vcz_ftoa(char *buf, float v, int8_t precision); #define VCZ_PLINK_HOM_A1 0x0 /* 00 */ diff --git a/tests/test_ftoa.py b/tests/test_ftoa.py new file mode 100644 index 0000000..0785b9c --- /dev/null +++ b/tests/test_ftoa.py @@ -0,0 +1,14 @@ +import pytest + +from vcztools.ftoa import ftoa + +@pytest.mark.parametrize( + ("float_", "string_", "precision"), + [ + (1.12345, "1.123", 3), + (1.12345, "1.1234", 4), + (1.12345, "1.12345", 5), + ] +) +def test_ftoa(float_, string_, precision): + assert ftoa(float_, precision) == string_ diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index 9ecd538..b071764 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -214,7 +214,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) { int ret = -1; static char *kwlist[] = { "num_variants", "num_samples", "chrom", "pos", "id", "ref", - "alt", "qual", "filter_ids", "filter", NULL }; + "alt", "qual", "filter_ids", "filter", "precision", NULL }; int num_variants, num_samples; PyArrayObject *chrom = NULL; PyArrayObject *pos = NULL; @@ -224,6 +224,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) PyArrayObject *qual = NULL; PyArrayObject *filter_ids = NULL; PyArrayObject *filter = NULL; + int precision = NULL; npy_intp num_filters; int err; @@ -236,10 +237,10 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) goto out; // GCOVR_EXCL_LINE } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!O!O!O!", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!O!O!O!i", kwlist, &num_variants, &num_samples, &PyArray_Type, &chrom, &PyArray_Type, &pos, &PyArray_Type, &id, &PyArray_Type, &ref, &PyArray_Type, &alt, &PyArray_Type, - &qual, &PyArray_Type, &filter_ids, &PyArray_Type, &filter)) { + &qual, &PyArray_Type, &filter_ids, &PyArray_Type, &filter, &precision)) { goto out; } diff --git a/vcztools/cli.py b/vcztools/cli.py index 7ccb634..7741bd6 100644 --- a/vcztools/cli.py +++ b/vcztools/cli.py @@ -83,6 +83,13 @@ def wrapper(*args, **kwargs): default=None, help="Target regions to include.", ) +precision = click.option( + "-p", + "--precision", + type=int, + default=3, + help="precision level for writing floats", +) version = click.version_option(version=f"{provenance.__version__}") @@ -148,6 +155,7 @@ def index(path, nrecords, stats): @targets @include @exclude +@precision @handle_exception def query( path, @@ -160,6 +168,7 @@ def query( samples, include, exclude, + precision, ): """ Transform VCZ into user-defined formats with efficient subsetting and @@ -190,6 +199,7 @@ def query( force_samples=force_samples, include=include, exclude=exclude, + precision=precision, ) @@ -238,6 +248,7 @@ def query( @targets @include @exclude +@precision @handle_exception def view( path, @@ -254,6 +265,7 @@ def view( drop_genotypes, include, exclude, + precision, ): """ Convert VCZ dataset to VCF with efficient subsetting and filtering. @@ -300,6 +312,7 @@ def view( drop_genotypes=drop_genotypes, include=include, exclude=exclude, + precision=precision, ) diff --git a/vcztools/ftoa.py b/vcztools/ftoa.py new file mode 100644 index 0000000..99bbcc6 --- /dev/null +++ b/vcztools/ftoa.py @@ -0,0 +1,29 @@ + + +def ftoa(f, precision=3): + buf = "" + + if f < 0: + buf += '-' + f = -f + + i = round(f * 10**precision) + buf += str(i // 10**precision) + + d = [0] * precision + + for idx in range(precision): + d[idx] = (i // 10**idx) % 10 + + buf += "." + for idx in range(precision): + sum_ = 0 + for j in d: + sum_ += j + if sum_ > 0: + buf += str(d.pop()) + + if buf[-1] == ".": + buf += "0" + + return buf diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index 8b8b0cd..11f6e66 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -86,6 +86,7 @@ def write_vcf( drop_genotypes: bool = False, include: str | None = None, exclude: str | None = None, + precision ) -> None: root = zarr.open(vcz, mode="r") @@ -138,6 +139,7 @@ def write_vcf( output, drop_genotypes=drop_genotypes, no_update=no_update, + precision=precision, ) @@ -150,6 +152,7 @@ def c_chunk_to_vcf( *, drop_genotypes, no_update, + precision, ): format_fields = {} info_fields = {} @@ -240,6 +243,7 @@ def c_chunk_to_vcf( qual=qual, filter_ids=filters, filter=filter_, + precision = precision, ) # print(encoder.arrays) if gt is not None: