From 89ea29d50ff1b91bef6c5447bd1c4083add4efb5 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Tue, 15 Apr 2025 12:43:07 -0700 Subject: [PATCH 1/4] feature(data-types): add datetime64, fixed-length-ascii, fixed-length-bytes, and fixed-length-ucs4 data-types from zarr-python --- data-types/datetime64/README.md | 33 +++++++++++++++++++++++ data-types/datetime64/schema.json | 26 ++++++++++++++++++ data-types/fixed-length-ascii/README.md | 33 +++++++++++++++++++++++ data-types/fixed-length-ascii/schema.json | 26 ++++++++++++++++++ data-types/fixed-length-bytes/README.md | 33 +++++++++++++++++++++++ data-types/fixed-length-bytes/schema.json | 26 ++++++++++++++++++ data-types/fixed-length-ucs4/README.md | 33 +++++++++++++++++++++++ data-types/fixed-length-ucs4/schema.json | 26 ++++++++++++++++++ 8 files changed, 236 insertions(+) create mode 100644 data-types/datetime64/README.md create mode 100644 data-types/datetime64/schema.json create mode 100644 data-types/fixed-length-ascii/README.md create mode 100644 data-types/fixed-length-ascii/schema.json create mode 100644 data-types/fixed-length-bytes/README.md create mode 100644 data-types/fixed-length-bytes/schema.json create mode 100644 data-types/fixed-length-ucs4/README.md create mode 100644 data-types/fixed-length-ucs4/schema.json diff --git a/data-types/datetime64/README.md b/data-types/datetime64/README.md new file mode 100644 index 0000000..cbba790 --- /dev/null +++ b/data-types/datetime64/README.md @@ -0,0 +1,33 @@ +# Datetime64 data type + +Defines a data type for a datetime object based on a 64-bit integer. + +## Permitted fill values + +The value of the `fill_value` metadata key must be a signed 64-bit integer. 
+ +## Example + +For example, the array metadata below specifies that the array uses the datetime64 data type: + +```json +{ + "data_type": "datetime64", + "fill_value": -9223372036854775808, + "configuration": { + "unit": "s" + }, +} +``` + +## Notes + +Valid values for the `unit` configuration option include: `["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"]` + +## Change log + +No changes yet. + +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) diff --git a/data-types/datetime64/schema.json b/data-types/datetime64/schema.json new file mode 100644 index 0000000..af01dc1 --- /dev/null +++ b/data-types/datetime64/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "datetime64" + }, + "configuration": { + "type": "object", + "properties": { + "unit": { + "type": "string" + } + }, + "required": ["unit"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "datetime64" } + ] + } \ No newline at end of file diff --git a/data-types/fixed-length-ascii/README.md b/data-types/fixed-length-ascii/README.md new file mode 100644 index 0000000..befc01a --- /dev/null +++ b/data-types/fixed-length-ascii/README.md @@ -0,0 +1,33 @@ +# Fixed-length ASCII data type + +Defines a data type for fixed-length ASCII strings. + +## Permitted fill values + +The value of the `fill_value` metadata key must be a string. + +## Example + +For example, the array metadata below specifies that the array contains fixed-length ASCII strings: + +```json +{ + "data_type": "fixed-length-ascii", + "fill_value": "", + "configuration": { + "length_bits": 24 + } +} +``` + +## Notes + +TBD + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-ascii/schema.json b/data-types/fixed-length-ascii/schema.json new file mode 100644 index 0000000..53fb75f --- /dev/null +++ b/data-types/fixed-length-ascii/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "fixed-length-ascii" + }, + "configuration": { + "type": "object", + "properties": { + "length_bits": { + "type": "integer" + } + }, + "required": ["length_bits"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "fixed-length-ascii" } + ] +} \ No newline at end of file diff --git a/data-types/fixed-length-bytes/README.md b/data-types/fixed-length-bytes/README.md new file mode 100644 index 0000000..107dcc8 --- /dev/null +++ b/data-types/fixed-length-bytes/README.md @@ -0,0 +1,33 @@ +# Fixed-length bytes data type + +Defines a data type for fixed-length byte strings. + +## Permitted fill values + +The value of the `fill_value` metadata key must be a string. + +## Example + +For example, the array metadata below specifies that the array contains fixed-length byte strings: + +```json +{ + "data_type": "fixed-length-bytes", + "fill_value": "", + "configuration": { + "length_bits": 24 + } +} +``` + +## Notes + +TBD + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-bytes/schema.json b/data-types/fixed-length-bytes/schema.json new file mode 100644 index 0000000..22861cd --- /dev/null +++ b/data-types/fixed-length-bytes/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "fixed-length-bytes" + }, + "configuration": { + "type": "object", + "properties": { + "length_bits": { + "type": "integer" + } + }, + "required": ["length_bits"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "fixed-length-bytes" } + ] +} \ No newline at end of file diff --git a/data-types/fixed-length-ucs4/README.md b/data-types/fixed-length-ucs4/README.md new file mode 100644 index 0000000..3f95781 --- /dev/null +++ b/data-types/fixed-length-ucs4/README.md @@ -0,0 +1,33 @@ +# Fixed-length Unicode string data type + +Defines a data type for fixed-length Unicode strings. + +## Permitted fill values + +The value of the `fill_value` metadata key must be a string. + +## Example + +For example, the array metadata below specifies that the array contains fixed-length unicode strings: + +```json +{ + "data_type": "fixed-length-ucs4", + "fill_value": "", + "configuration": { + "length_bits": 24 + }, +} +``` + +## Notes + +TBD + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-ucs4/schema.json b/data-types/fixed-length-ucs4/schema.json new file mode 100644 index 0000000..41bb363 --- /dev/null +++ b/data-types/fixed-length-ucs4/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "fixed-length-ucs4" + }, + "configuration": { + "type": "object", + "properties": { + "length_bits": { + "type": "integer" + } + }, + "required": ["length_bits"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "fixed-length-ucs4" } + ] +} \ No newline at end of file From 1be0c9eff89f7571dee3118dcdca9a358462c3f3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 27 May 2025 16:40:39 +0200 Subject: [PATCH 2/4] remove datetime stuff, rename ucs4 to utf32, fill out schema --- data-types/datetime64/README.md | 33 ------------ data-types/datetime64/schema.json | 26 ---------- data-types/fixed-length-ucs4/README.md | 33 ------------ data-types/fixed-length-utf32/README.md | 52 +++++++++++++++++++ .../schema.json | 6 +-- 5 files changed, 55 insertions(+), 95 deletions(-) delete mode 100644 data-types/datetime64/README.md delete mode 100644 data-types/datetime64/schema.json delete mode 100644 data-types/fixed-length-ucs4/README.md create mode 100644 data-types/fixed-length-utf32/README.md rename data-types/{fixed-length-ucs4 => fixed-length-utf32}/schema.json (80%) diff --git a/data-types/datetime64/README.md b/data-types/datetime64/README.md deleted file mode 100644 index cbba790..0000000 --- a/data-types/datetime64/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Datetime64 data type - -Defines a data type for a datetime object based on a 64-bit integer. 
- -## Permitted fill values - -The value of the `fill_value` metadata key must be a signed 64-bit integer. - -## Example - -For example, the array metadata below specifies that the array uses the datetime64 data type: - -```json -{ - "data_type": "datetime64", - "fill_value": -9223372036854775808, - "configuration": { - "unit": "s" - }, -} -``` - -## Notes - -Valid values for the `unit` configuration option include: `["Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"]` - -## Change log - -No changes yet. - -## Current maintainers - -* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) diff --git a/data-types/datetime64/schema.json b/data-types/datetime64/schema.json deleted file mode 100644 index af01dc1..0000000 --- a/data-types/datetime64/schema.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "oneOf": [ - { - "type": "object", - "properties": { - "name": { - "const": "datetime64" - }, - "configuration": { - "type": "object", - "properties": { - "unit": { - "type": "string" - } - }, - "required": ["unit"], - "additionalProperties": false - } - }, - "required": ["name", "configuration"], - "additionalProperties": false - }, - { "const": "datetime64" } - ] - } \ No newline at end of file diff --git a/data-types/fixed-length-ucs4/README.md b/data-types/fixed-length-ucs4/README.md deleted file mode 100644 index 3f95781..0000000 --- a/data-types/fixed-length-ucs4/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Fixed-length Unicode string data type - -Defines a data type for fixed-length Unicode strings. - -## Permitted fill values - -The value of the `fill_value` metadata key must be a string. 
- -## Example - -For example, the array metadata below specifies that the array contains fixed-length unicode strings: - -```json -{ - "data_type": "fixed-length-ucs4", - "fill_value": "", - "configuration": { - "length_bits": 24 - }, -} -``` - -## Notes - -TBD - -## Change log - -No changes yet. - -## Current maintainers - -* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-utf32/README.md b/data-types/fixed-length-utf32/README.md new file mode 100644 index 0000000..dfbbad4 --- /dev/null +++ b/data-types/fixed-length-utf32/README.md @@ -0,0 +1,52 @@ +# `fixed_length_utf32` data type + +This document defines a data type for fixed-length Unicode strings encoded using [UTF-32](https://www.unicode.org/versions/Unicode5.0.0/appC.pdf#M9.19040.HeadingAppendix.C2.Encoding.Forms.in.ISOIEC.10646). UTF-32, also known as UCS4, is an encoding of Unicode strings that allocates 4 bytes to each Unicode code point. + +"Fixed length" as used here means that the `fixed_length_utf32` data type is parametrized by a integral length, which sets a fixed length for every scalar belonging to that data type. + +### Name + +The name of this data type is the string `"fixed_length_utf32"` + +### Configuration + +This data type requires a configuration. The configuration for this data type is a JSON object with the following fields: + +| field name | type | required | notes | +|------------|----------|---|---| +| `"length_bytes"` | integer | yes | The number MUST represent an integer divisible by 4 in the inclusive range `[0, 2147483644]` | + +> Note: the maximum length of 2147483644 was chosen to match the semantics of the [NumPy `"U"` data type](https://numpy.org/devdocs/reference/arrays.scalars.html#numpy.str_), which as of this writing has a maximum length in code points of 536870911, i.e. 2147483644 / 4. 
+ +> Note: given a particular `fixed_length_utf32` data type, the length of an array element in Unicode code points is the value of the `length_bytes` field divided by 4. + +### Examples + +```json +{ + "name": "fixed_length_utf32", + "configuration" : { + "length_bytes": 4 + } +} +``` + +## Fill value representation + +The value of the `fill_value` metadata key must be a string. When encoded in UTF-32, the fill value MUST have a length in bytes equal to the value of the `length_bytes` specified in the `configuration` of this data type. + +## Codec compatibility + +This data type is compatible with any codec that supports arrays with fixed-sized data types. + +## Notes + +This data type is designed for NumPy compatibility. UTF-32 is not a good fit for many applications that need to model arrays of strings, as real string datasets are often composed of variable-length strings. A variable-length string data type should be preferred in these cases. + +## Change log + +No changes yet. + +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-ucs4/schema.json b/data-types/fixed-length-utf32/schema.json similarity index 80% rename from data-types/fixed-length-ucs4/schema.json rename to data-types/fixed-length-utf32/schema.json index 41bb363..fad9d63 100644 --- a/data-types/fixed-length-ucs4/schema.json +++ b/data-types/fixed-length-utf32/schema.json @@ -5,7 +5,7 @@ "type": "object", "properties": { "name": { - "const": "fixed-length-ucs4" + "const": "fixed_length_utf32" }, "configuration": { "type": "object", @@ -14,13 +14,13 @@ "type": "integer" } }, - "required": ["length_bits"], + "required": ["length_bytes"], "additionalProperties": false } }, "required": ["name", "configuration"], "additionalProperties": false }, - { "const": "fixed-length-ucs4" } + { "const": "fixed_length_utf32" } ] } \ No newline at end of file From 
bbc69d12e88c8d77f781f632bf7ef944fd5823e9 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 2 Jun 2025 19:36:08 +0200 Subject: [PATCH 3/4] Update data-types/fixed-length-utf32/schema.json Co-authored-by: Norman Rzepka --- data-types/fixed-length-utf32/schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-types/fixed-length-utf32/schema.json b/data-types/fixed-length-utf32/schema.json index fad9d63..3135246 100644 --- a/data-types/fixed-length-utf32/schema.json +++ b/data-types/fixed-length-utf32/schema.json @@ -10,7 +10,7 @@ "configuration": { "type": "object", "properties": { - "length_bits": { + "length_bytes": { "type": "integer" } }, From 7894f9b3abe69da4f7c5bcfe33066b72b5335000 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 3 Jun 2025 16:33:50 +0200 Subject: [PATCH 4/4] add null terminated clarification --- data-types/fixed-length-utf32/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data-types/fixed-length-utf32/README.md b/data-types/fixed-length-utf32/README.md index dfbbad4..81ab3ed 100644 --- a/data-types/fixed-length-utf32/README.md +++ b/data-types/fixed-length-utf32/README.md @@ -1,9 +1,11 @@ # `fixed_length_utf32` data type -This document defines a data type for fixed-length Unicode strings encoded using [UTF-32](https://www.unicode.org/versions/Unicode5.0.0/appC.pdf#M9.19040.HeadingAppendix.C2.Encoding.Forms.in.ISOIEC.10646). UTF-32, also known as UCS4, is an encoding of Unicode strings that allocates 4 bytes to each Unicode code point. +This document defines a data type for fixed-length, null-terminated Unicode strings encoded using [UTF-32](https://www.unicode.org/versions/Unicode5.0.0/appC.pdf#M9.19040.HeadingAppendix.C2.Encoding.Forms.in.ISOIEC.10646). UTF-32, also known as UCS4, is an encoding of Unicode strings that allocates 4 bytes to each Unicode code point. 
"Fixed length" as used here means that the `fixed_length_utf32` data type is parametrized by a integral length, which sets a fixed length for every scalar belonging to that data type. +"Null-terminated" as used here means that, for an integral length `L`, a `fixed_length_utf32` data type parameterized with `L` can represent a string shorter than `L` by adding null bytes to the end of that string until it has length `L`. + ### Name The name of this data type is the string `"fixed_length_utf32"` @@ -33,7 +35,7 @@ This data type requires a configuration. The configuration for this data type is ## Fill value representation -The value of the `fill_value` metadata key must be a string. When encoded in UTF-32, the fill value MUST have a length in bytes equal to the value of the `length_bytes` specified in the `configuration` of this data type. +The value of the `fill_value` metadata key must be a string. When encoded in UTF-32, the fill value MUST have a length in bytes less than or equal to the value of the `length_bytes` specified in the `configuration` of this data type. ## Codec compatibility