Skip to content

Commit 6d82d99

Browse files
Merge pull request #21018 from guerler/merge_25.0_into_dev_oct
Merge 25.0 into dev
2 parents 70d5ff6 + 7cf6bc5 commit 6d82d99

File tree

6 files changed

+168
-1
lines changed

6 files changed

+168
-1
lines changed

client/src/composables/zipExplorer.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ export function validateLocalZipFile(file?: File | null): string {
310310
}
311311

312312
/**
 * Reports whether a locally selected file is a ZIP archive, judged solely by
 * the MIME type the browser attached to it.
 *
 * Both `application/zip` and `application/x-zip-compressed` are accepted.
 *
 * @param file - The candidate file; may be omitted, `null` or `undefined`.
 * @returns `true` only when a file is given and its `type` is one of the
 *   accepted ZIP MIME types; `false` otherwise.
 */
export function isLocalZipFile(file?: File | null): boolean {
    // A missing file yields `undefined` here, which matches neither literal.
    const mimeType = file?.type;
    return mimeType === "application/zip" || mimeType === "application/x-zip-compressed";
}
315315

316316
export async function isRemoteZipFile(url: string): Promise<boolean> {

lib/galaxy/config/sample/datatypes_conf.xml.sample

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@
150150
<datatype extension="vitessce.json" type="galaxy.datatypes.text:VitessceJson" mimetype="application/json" display_in_upload="True">
151151
<visualization plugin="vitessce" />
152152
</datatype>
153+
<datatype extension="auspice.json" type="galaxy.datatypes.text:AuspiceJson" mimetype="application/json" display_in_upload="True" />
153154
<datatype extension="data_manager_json" type="galaxy.datatypes.text:DataManagerJson" mimetype="application/json" subclass="true" display_in_upload="false"/>
154155
<datatype extension="dbn" type="galaxy.datatypes.sequence:DotBracket" display_in_upload="true" description="Dot-Bracket format is a text-based format for storing both an RNA sequence and its corresponding 2D structure." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Dbn"/>
155156
<datatype extension="fai" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true" subclass="true" description="A Fasta Index File is a text file consisting of lines each with five TAB-delimited columns : Name, Length, offset, linebases, Linewidth" description_url="http://www.htslib.org/doc/faidx.html"/>
@@ -1180,6 +1181,7 @@
11801181
<datatype extension="bcsp" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Binary format of k-mer hash table which is only compatible with Fairy"/>
11811182
<!-- rdeval types -->
11821183
<datatype extension="rd" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Rdeval read sketch"/>
1184+
<datatype extension="safetensors" type="galaxy.datatypes.binary:Safetensors" mimetype="application/octet-stream" display_in_upload="true" description="A simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy)" description_url="https://huggingface.co/docs/safetensors/index"/>
11831185
</registration>
11841186

11851187
<sniffers>
@@ -1395,6 +1397,7 @@
13951397
<sniffer type="galaxy.datatypes.text:CytoscapeJson"/>
13961398
<sniffer type="galaxy.datatypes.text:GeoJson"/>
13971399
<sniffer type="galaxy.datatypes.text:VitessceJson"/>
1400+
<sniffer type="galaxy.datatypes.text:AuspiceJson"/>
13981401
<sniffer type="galaxy.datatypes.text:PithyaResult"/>
13991402
<sniffer type="galaxy.datatypes.text:BCSLts"/>
14001403
<sniffer type="galaxy.datatypes.text:Json"/>

lib/galaxy/datatypes/binary.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4872,3 +4872,102 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
48724872
with open(dataset.get_file_name(), "rb") as handle:
48734873
header_bytes = handle.read(8)
48744874
dataset.metadata.version = struct.unpack("<i", header_bytes[4:8])[0]
4875+
4876+
4877+
@build_sniff_from_prefix
class Safetensors(Binary):
    """
    Safetensors is a simple format for storing tensors safely (as opposed to pickle) that is still fast (zero-copy).
    It provides a secure way to store and load tensors without the security risks associated with pickle-based formats.
    Safetensors files consist of an 8-byte little-endian header size, a JSON header of that size, and the tensor data.
    more info at: https://github.com/huggingface/safetensors
    """

    file_ext = "safetensors"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determining if the file is in safetensors format

        The check reads the 8-byte little-endian header size, verifies that the
        JSON header starts with ``{`` and (ignoring trailing space padding) ends
        with ``}``, parses it, and requires at least one entry other than
        ``__metadata__`` that is a dict carrying a ``data_offsets`` key.
        Any exception raised along the way is treated as "not safetensors".

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('cellpose_model_safetensors.safetensors')
        >>> Safetensors().sniff(fname)
        True
        >>> fname = get_test_fname('test_charmm.vel')
        >>> Safetensors().sniff(fname)
        False
        """
        try:
            prefix = file_prefix.contents_header_bytes

            # We need the 8-byte size field plus at least the first header byte.
            if len(prefix) <= 8:
                return False

            header_size = int.from_bytes(prefix[:8], "little")

            # The header is capped at 100MB to avoid parsing extremely large JSON documents.
            # In practice headers are a few KB to MB (tensor names, shapes, dtypes, offsets).
            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#benefits
            if header_size == 0 or header_size > 10**8:  # 100MB max for JSON header
                return False

            # The file must be large enough to contain the full header.
            header_end = 8 + header_size
            if file_prefix.file_size < header_end:
                return False

            # The header must begin with '{' (0x7B). Checking a single byte first keeps
            # the rejection of unrelated binary files cheap, before any further reads.
            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#format
            if prefix[8] != 0x7B:
                return False

            # Fetch the complete header: from the prefix when it fits, otherwise with a
            # single read from the file.
            if header_end <= len(prefix):
                header_bytes = prefix[8:header_end]
            else:
                with open(file_prefix.filename, "rb") as handle:
                    handle.seek(8)
                    header_bytes = handle.read(header_size)

            if len(header_bytes) != header_size:
                return False

            # The format allows the header to be right-padded with spaces (0x20),
            # so the closing '}' is the last byte only after stripping that padding.
            if not header_bytes.rstrip(b" ").endswith(b"}"):
                return False

            header = json.loads(header_bytes.decode("utf-8"))
            if not isinstance(header, dict):
                return False

            # Basic validation: apart from the special "__metadata__" key, at least one
            # entry must describe a tensor, i.e. be a dict with "data_offsets".
            return any(
                isinstance(value, dict) and "data_offsets" in value
                for key, value in header.items()
                if key != "__metadata__"
            )

        except Exception:
            # Any exception during parsing means it's not a valid safetensors file
            return False
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"version": "v2",
3+
"meta": {
4+
"title": "Minimal AuspiceJSON",
5+
"updated": "2025-02-05",
6+
"panels": ["tree"]
7+
},
8+
"tree": {
9+
"name": "1",
10+
"node_attrs": {
11+
"div": 1
12+
}
13+
}
14+
}
76 Bytes
Binary file not shown.

lib/galaxy/datatypes/text.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,57 @@ def _looks_like_is_vitesscejson(self, file_prefix: FilePrefix, load_size: int =
738738
return False
739739

740740

741+
@build_sniff_from_prefix
class AuspiceJson(Json):
    """
    Auspice is a visualization tool for phylogenetic trees and associated data.
    It uses JSON format to represent the tree structure and metadata.
    """

    file_ext = "auspice.json"

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """Run the parent class's peek logic, then set the blurb to "AuspiceJSON" unless the dataset is purged."""
        super().set_peek(dataset)
        if not dataset.dataset.purged:
            dataset.blurb = "AuspiceJSON"

    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
        """
        Determines whether the file is in Auspice v2 JSON by looking for the
        strings "version", "meta", "updated" and "panels", which are expected by the
        https://docs.nextstrain.org/projects/auspice/en/stable/releases/v2.html format
        and also will be in the first part of the file

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( '1.json' )
        >>> AuspiceJson().sniff( fname )
        False
        >>> fname = get_test_fname( '1.auspicejson' )
        >>> AuspiceJson().sniff( fname )
        True
        """
        is_auspicejson = False
        if self._looks_like_json(file_prefix):
            is_auspicejson = self._looks_like_is_auspicejson(file_prefix)
        return is_auspicejson

    def _looks_like_is_auspicejson(self, file_prefix: FilePrefix, load_size: int = 20000) -> bool:
        """
        Heuristic check on the first ``load_size`` characters of the file.

        Expects the text to start with { and to contain the strings 'version',
        'meta', 'updated' and 'panels'. This is a plain substring test; the JSON
        is not parsed here. Returns False if the file cannot be read or decoded.
        """
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # locale's default encoding.
            with open(file_prefix.filename, encoding="utf-8") as fh:
                segment_str = fh.read(load_size)
        except Exception:
            # Best effort: an unreadable or undecodable file is simply not Auspice JSON.
            return False

        return segment_str.startswith("{") and all(
            marker in segment_str for marker in ("version", "meta", "updated", "panels")
        )
790+
791+
741792
@build_sniff_from_prefix
742793
class Obo(Text):
743794
"""

0 commit comments

Comments
 (0)