add safetensors

nilchia · mvdbeek · commit 9896e9b49778 · 2025-10-06T20:33:59.000+02:00
remove safetensor dependency

keeping only sniff

update description

test header and add more comments and info about data

add test

test files match the files in galaxy-test-data

add test files

Update lib/galaxy/config/sample/datatypes_conf.xml.sample

Co-authored-by: M Bernt <m.bernt@ufz.de>

remove subclass=true

add Wolfgang's suggestion

black reformatter

json check

using same function for json and safetensors

fix lint

reformat

fix isort lint

Rename test dataset

rename test file

try to use FilePrefix

add if for when the header is big

fix lint

Update lib/galaxy/datatypes/util/json.py

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

Update lib/galaxy/datatypes/binary.py

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

remove unused function

Update lib/galaxy/datatypes/binary.py

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

Restore text.py and remove json.py

check if header is dict

Update lib/galaxy/config/sample/datatypes_conf.xml.sample

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

remove pth file

Update lib/galaxy/datatypes/binary.py

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

Update lib/galaxy/datatypes/binary.py

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

Update lib/galaxy/datatypes/binary.py

Co-authored-by: Nicola Soranzo <nicola.soranzo@gmail.com>

correct test file name
diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample
@@ -1172,6 +1172,7 @@
     <datatype extension="bcsp" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Binary format of k-mer hash table which is only compatible with Fairy"/>
     <!-- rdeval types -->
     <datatype extension="rd" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Rdeval read sketch"/>
+    <datatype extension="safetensors" type="galaxy.datatypes.binary:Safetensors" mimetype="application/octet-stream" display_in_upload="true" description="A simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy)" description_url="https://huggingface.co/docs/safetensors/index"/>
   </registration>
 
   <sniffers>
diff --git a/lib/galaxy/datatypes/binary.py b/lib/galaxy/datatypes/binary.py
@@ -4847,3 +4847,102 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
         with open(dataset.get_file_name(), "rb") as handle:
             header_bytes = handle.read(8)
         dataset.metadata.version = struct.unpack("<i", header_bytes[4:8])[0]
+
+
+@build_sniff_from_prefix
+class Safetensors(Binary):
+    """
+    safetensors is a new simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy).
+    It provides a secure way to store and load tensors without the security risks associated with pickle-based formats.
+    Safetensors files consist of a JSON header followed by tensor data.
+    more info at: https://github.com/huggingface/safetensors
+    """
+
+    file_ext = "safetensors"
+
+    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
+        """
+        Determining if the file is in safetensors format
+        >>> from galaxy.datatypes.sniff import get_test_fname
+        >>> fname = get_test_fname('cellpose_model_safetensors.safetensors')
+        >>> Safetensors().sniff(fname)
+        True
+        >>> fname = get_test_fname('test_charmm.vel')
+        >>> Safetensors().sniff(fname)
+        False
+        """
+        try:
+            # Safetensors files start with an 8-byte little-endian integer
+            # indicating the size of the JSON header
+            if len(file_prefix.contents_header_bytes) < 8:
+                return False
+
+            header_size = int.from_bytes(file_prefix.contents_header_bytes[:8], "little")
+
+            # Currently, there's a limit on the size of the header of 100MB to prevent parsing extremely large JSON headers
+            # In practice, safetensors headers are typically just a few KB to MB
+            # (containing tensor names, shapes, dtypes, and offsets - rarely exceeds 1-10MB even for large models)
+            # But in theory it is possible to have 100 MB header
+            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#benefits
+            if header_size == 0 or header_size > 10**8:  # 100MB max for JSON header
+                return False
+
+            # Check if file is large enough to contain the full header
+            if file_prefix.file_size < 8 + header_size:
+                return False
+
+            # CRITICAL: Check if header begins with '{' character (0x7B) as per safetensors spec
+            # This is required by the format and helps distinguish from other binary formats
+            # Only check 1 byte to avoid issues with malicious header_size values
+            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#format
+            if file_prefix.contents_header_bytes[8] != 0x7B:
+                return False
+
+            # Check if header ends with '}' character (0x7D) as per safetensors spec
+            # This requires reading more data if header extends beyond the prefix
+            header_end_pos = 8 + header_size - 1
+            if header_end_pos < len(file_prefix.contents_header_bytes):
+                # Header end is within the prefix
+                if file_prefix.contents_header_bytes[header_end_pos] != 0x7D:
+                    return False
+            else:
+                # Header extends beyond prefix, need to check from file
+                with open(file_prefix.filename, "rb") as f:
+                    f.seek(header_end_pos)
+                    last_header_byte = f.read(1)
+                    if len(last_header_byte) != 1 or last_header_byte[0] != 0x7D:
+                        return False
+
+            # Read the full header for JSON parsing
+            if 8 + header_size <= len(file_prefix.contents_header_bytes):
+                # Entire header is in the prefix
+                header_bytes = file_prefix.contents_header_bytes[8 : 8 + header_size]
+            else:
+                # Need to read full header from file
+                with open(file_prefix.filename, "rb") as f:
+                    f.seek(8)
+                    header_bytes = f.read(header_size)
+
+            if len(header_bytes) != header_size:
+                return False
+
+            # Parse the validated JSON header
+            header = json.loads(header_bytes.decode("utf-8"))
+            # check if header is a dict
+            if not isinstance(header, dict):
+                return False
+            # Basic validation: check if it looks like safetensors metadata
+            # Safetensors headers should have entries with data_offsets
+            has_valid_entries = False
+            for key, value in header.items():
+                if key == "__metadata__":  # Special metadata key
+                    continue
+                if isinstance(value, dict) and "data_offsets" in value:
+                    has_valid_entries = True
+                    break
+
+            return has_valid_entries
+
+        except Exception:
+            # Any exception during parsing means it's not a valid safetensors file
+            return False
diff --git a/lib/galaxy/datatypes/test/cellpose_model_safetensors.safetensors b/lib/galaxy/datatypes/test/cellpose_model_safetensors.safetensors