Skip to content

Commit 9896e9b

Browse files
nilchia authored and mvdbeek committed
add safetensors
remove safetensor dependency keeping only sniff update description test header and add more comments and info about data add test test files match the files in galaxy-test-data add test files Update lib/galaxy/config/sample/datatypes_conf.xml.sample Co-authored-by: M Bernt <[email protected]> remove subclass=true add Wolfgang's suggestion black reformater json check using same function for json and safetensors fix lint reformat fix isort lint Rename test dataset rename test file try to use FilePrefix add if for when the header is big fix lint Update lib/galaxy/datatypes/util/json.py Co-authored-by: Nicola Soranzo <[email protected]> Update lib/galaxy/datatypes/binary.py Co-authored-by: Nicola Soranzo <[email protected]> remove unused function Update lib/galaxy/datatypes/binary.py Co-authored-by: Nicola Soranzo <[email protected]> Restore text.py and remove json.py check if header is dict Update lib/galaxy/config/sample/datatypes_conf.xml.sample Co-authored-by: Nicola Soranzo <[email protected]> remove pth file Update lib/galaxy/datatypes/binary.py Co-authored-by: Nicola Soranzo <[email protected]> Update lib/galaxy/datatypes/binary.py Co-authored-by: Nicola Soranzo <[email protected]> Update lib/galaxy/datatypes/binary.py Co-authored-by: Nicola Soranzo <[email protected]> correct test file name
1 parent 4e111b5 commit 9896e9b

File tree

3 files changed

+100
-0
lines changed

3 files changed

+100
-0
lines changed

lib/galaxy/config/sample/datatypes_conf.xml.sample

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,7 @@
11721172
<datatype extension="bcsp" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Binary format of k-mer hash table which is only compatible with Fairy"/>
11731173
<!-- rdeval types -->
11741174
<datatype extension="rd" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Rdeval read sketch"/>
1175+
<datatype extension="safetensors" type="galaxy.datatypes.binary:Safetensors" mimetype="application/octet-stream" display_in_upload="true" description="A simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy)" description_url="https://huggingface.co/docs/safetensors/index"/>
11751176
</registration>
11761177

11771178
<sniffers>

lib/galaxy/datatypes/binary.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4847,3 +4847,102 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
48474847
with open(dataset.get_file_name(), "rb") as handle:
48484848
header_bytes = handle.read(8)
48494849
dataset.metadata.version = struct.unpack("<i", header_bytes[4:8])[0]
4850+
4851+
4852+
@build_sniff_from_prefix
class Safetensors(Binary):
    """
    Safetensors is a simple format for storing tensors safely (as opposed to pickle) that is still fast (zero-copy).
    It provides a way to store and load tensors without the security risks associated with pickle-based formats.

    File layout:

    - 8 bytes: unsigned little-endian integer N, the size of the header
    - N bytes: UTF-8 JSON header; it must begin with ``{`` and may be
      trailing-padded with spaces (0x20)
    - remainder: raw tensor data

    more info at: https://github.com/huggingface/safetensors
    """

    file_ext = "safetensors"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determine whether the file is in safetensors format.

        The size field, the first header byte and the last header byte are
        checked before the JSON header is parsed. The header must decode to a
        dict containing at least one entry (other than ``__metadata__``) that
        is itself a dict with a ``data_offsets`` key.

        :param file_prefix: prefix of the file being sniffed
        :returns: True if the file looks like safetensors, False otherwise
            (including when any error occurs while reading or parsing)

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('cellpose_model_safetensors.safetensors')
        >>> Safetensors().sniff(fname)
        True
        >>> fname = get_test_fname('test_charmm.vel')
        >>> Safetensors().sniff(fname)
        False
        """
        try:
            prefix = file_prefix.contents_header_bytes

            # Safetensors files start with an 8-byte little-endian integer
            # indicating the size of the JSON header
            if len(prefix) < 8:
                return False
            header_size = int.from_bytes(prefix[:8], "little")

            # Limit the header to 100MB to avoid parsing extremely large JSON headers.
            # In practice headers are a few KB to MB, but 100MB is theoretically possible.
            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#benefits
            if header_size == 0 or header_size > 10**8:
                return False

            # Check if file is large enough to contain the full header
            header_end = 8 + header_size
            if file_prefix.file_size < header_end:
                return False

            # The header MUST begin with '{' (0x7B) as per the safetensors spec.
            # Slice comparison so that a prefix of exactly 8 bytes yields False
            # instead of raising IndexError.
            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#format
            if prefix[8:9] != b"{":
                return False

            if header_end <= len(prefix):
                # Entire header is in the prefix
                header_bytes = prefix[8:header_end]
            else:
                # Header extends beyond the prefix, read it from the file
                with open(file_prefix.filename, "rb") as handle:
                    # Cheap rejection before reading a potentially large header:
                    # the last header byte is either the closing '}' or a
                    # padding space (the spec allows trailing 0x20 padding).
                    handle.seek(header_end - 1)
                    if handle.read(1) not in (b"}", b" "):
                        return False
                    handle.seek(8)
                    header_bytes = handle.read(header_size)
                if len(header_bytes) != header_size:
                    return False

            # Ignoring the optional space padding, the header must end with '}'
            if not header_bytes.rstrip(b" ").endswith(b"}"):
                return False

            # Parse the JSON header (json.loads tolerates the trailing padding)
            header = json.loads(header_bytes.decode("utf-8"))
            if not isinstance(header, dict):
                return False

            # Basic validation: besides the special "__metadata__" key, at least
            # one entry must look like a tensor description with data_offsets
            return any(
                isinstance(value, dict) and "data_offsets" in value
                for key, value in header.items()
                if key != "__metadata__"
            )
        except Exception:
            # Sniffing is best-effort: any error while reading or parsing
            # means this is not a valid safetensors file
            return False
76 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)