Merge pull request #21018 from guerler/merge_25.0_into_dev_oct

ahmedhamidawan · web-flow · commit 6d82d99bb520 · 2025-10-07T11:33:02.000-05:00
Merge 25.0 into dev
diff --git a/client/src/composables/zipExplorer.ts b/client/src/composables/zipExplorer.ts
@@ -310,7 +310,7 @@ export function validateLocalZipFile(file?: File | null): string {
 }
 
 export function isLocalZipFile(file?: File | null): boolean {
-    return Boolean(file) && file?.type === "application/zip";
+    return Boolean(file) && (file?.type === "application/zip" || file?.type === "application/x-zip-compressed");
 }
 
 export async function isRemoteZipFile(url: string): Promise<boolean> {
diff --git a/lib/galaxy/config/sample/datatypes_conf.xml.sample b/lib/galaxy/config/sample/datatypes_conf.xml.sample
@@ -150,6 +150,7 @@
     <datatype extension="vitessce.json" type="galaxy.datatypes.text:VitessceJson" mimetype="application/json" display_in_upload="True">
       <visualization plugin="vitessce" />
     </datatype>
+    <datatype extension="auspice.json" type="galaxy.datatypes.text:AuspiceJson" mimetype="application/json" display_in_upload="True" />
     <datatype extension="data_manager_json" type="galaxy.datatypes.text:DataManagerJson" mimetype="application/json" subclass="true" display_in_upload="false"/>
     <datatype extension="dbn" type="galaxy.datatypes.sequence:DotBracket" display_in_upload="true" description="Dot-Bracket format is a text-based format for storing both an RNA sequence and its corresponding 2D structure." description_url="https://wiki.galaxyproject.org/Learn/Datatypes#Dbn"/>
     <datatype extension="fai" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true" subclass="true" description="A Fasta Index File is a text file consisting of lines each with five TAB-delimited columns : Name, Length, offset, linebases, Linewidth" description_url="http://www.htslib.org/doc/faidx.html"/>
@@ -1180,6 +1181,7 @@
     <datatype extension="bcsp" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Binary format of k-mer hash table which is only compatible with Fairy"/>
     <!-- rdeval types -->
     <datatype extension="rd" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Rdeval read sketch"/>
+    <datatype extension="safetensors" type="galaxy.datatypes.binary:Safetensors" mimetype="application/octet-stream" display_in_upload="true" description="A simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy)" description_url="https://huggingface.co/docs/safetensors/index"/>
   </registration>
 
   <sniffers>
@@ -1395,6 +1397,7 @@
     <sniffer type="galaxy.datatypes.text:CytoscapeJson"/>
     <sniffer type="galaxy.datatypes.text:GeoJson"/>
     <sniffer type="galaxy.datatypes.text:VitessceJson"/>
+    <sniffer type="galaxy.datatypes.text:AuspiceJson"/>
     <sniffer type="galaxy.datatypes.text:PithyaResult"/>
     <sniffer type="galaxy.datatypes.text:BCSLts"/>
     <sniffer type="galaxy.datatypes.text:Json"/>
diff --git a/lib/galaxy/datatypes/binary.py b/lib/galaxy/datatypes/binary.py
@@ -4872,3 +4872,102 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
         with open(dataset.get_file_name(), "rb") as handle:
             header_bytes = handle.read(8)
         dataset.metadata.version = struct.unpack("<i", header_bytes[4:8])[0]
+
+
+@build_sniff_from_prefix
+class Safetensors(Binary):
+    """
+    safetensors is a new simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy).
+    It provides a secure way to store and load tensors without the security risks associated with pickle-based formats.
+    Safetensors files consist of a JSON header followed by tensor data.
+    more info at: https://github.com/huggingface/safetensors
+    """
+
+    file_ext = "safetensors"
+
+    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
+        """
+        Determining if the file is in safetensors format
+        >>> from galaxy.datatypes.sniff import get_test_fname
+        >>> fname = get_test_fname('cellpose_model_safetensors.safetensors')
+        >>> Safetensors().sniff(fname)
+        True
+        >>> fname = get_test_fname('test_charmm.vel')
+        >>> Safetensors().sniff(fname)
+        False
+        """
+        try:
+            # Safetensors files start with an 8-byte little-endian integer
+            # indicating the size of the JSON header
+            if len(file_prefix.contents_header_bytes) < 8:
+                return False
+
+            header_size = int.from_bytes(file_prefix.contents_header_bytes[:8], "little")
+
+            # Currently, there's a limit on the size of the header of 100MB to prevent parsing extremely large JSON headers
+            # In practice, safetensors headers are typically just a few KB to MB
+            # (containing tensor names, shapes, dtypes, and offsets - rarely exceeds 1-10MB even for large models)
+            # But in theory it is possible to have 100 MB header
+            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#benefits
+            if header_size == 0 or header_size > 10**8:  # 100MB max for JSON header
+                return False
+
+            # Check if file is large enough to contain the full header
+            if file_prefix.file_size < 8 + header_size:
+                return False
+
+            # CRITICAL: Check if header begins with '{' character (0x7B) as per safetensors spec
+            # This is required by the format and helps distinguish from other binary formats
+            # Only check 1 byte to avoid issues with malicious header_size values
+            # more info here: https://github.com/huggingface/safetensors?tab=readme-ov-file#format
+            if file_prefix.contents_header_bytes[8] != 0x7B:
+                return False
+
+            # Check if header ends with '}' character (0x7D) as per safetensors spec
+            # This requires reading more data if header extends beyond the prefix
+            header_end_pos = 8 + header_size - 1
+            if header_end_pos < len(file_prefix.contents_header_bytes):
+                # Header end is within the prefix
+                if file_prefix.contents_header_bytes[header_end_pos] != 0x7D:
+                    return False
+            else:
+                # Header extends beyond prefix, need to check from file
+                with open(file_prefix.filename, "rb") as f:
+                    f.seek(header_end_pos)
+                    last_header_byte = f.read(1)
+                    if len(last_header_byte) != 1 or last_header_byte[0] != 0x7D:
+                        return False
+
+            # Read the full header for JSON parsing
+            if 8 + header_size <= len(file_prefix.contents_header_bytes):
+                # Entire header is in the prefix
+                header_bytes = file_prefix.contents_header_bytes[8 : 8 + header_size]
+            else:
+                # Need to read full header from file
+                with open(file_prefix.filename, "rb") as f:
+                    f.seek(8)
+                    header_bytes = f.read(header_size)
+
+            if len(header_bytes) != header_size:
+                return False
+
+            # Parse the validated JSON header
+            header = json.loads(header_bytes.decode("utf-8"))
+            # check if header is a dict
+            if not isinstance(header, dict):
+                return False
+            # Basic validation: check if it looks like safetensors metadata
+            # Safetensors headers should have entries with data_offsets
+            has_valid_entries = False
+            for key, value in header.items():
+                if key == "__metadata__":  # Special metadata key
+                    continue
+                if isinstance(value, dict) and "data_offsets" in value:
+                    has_valid_entries = True
+                    break
+
+            return has_valid_entries
+
+        except Exception:
+            # Any exception during parsing means it's not a valid safetensors file
+            return False
diff --git a/lib/galaxy/datatypes/test/1.auspicejson b/lib/galaxy/datatypes/test/1.auspicejson
@@ -0,0 +1,14 @@
+{
+  "version": "v2",
+  "meta": {
+    "title": "Minimal AuspiceJSON",
+    "updated": "2025-02-05",
+    "panels": ["tree"]
+  },
+  "tree": {
+    "name": "1",
+    "node_attrs": {
+      "div": 1
+    }
+  }
+}
diff --git a/lib/galaxy/datatypes/test/cellpose_model_safetensors.safetensors b/lib/galaxy/datatypes/test/cellpose_model_safetensors.safetensors
diff --git a/lib/galaxy/datatypes/text.py b/lib/galaxy/datatypes/text.py
@@ -738,6 +738,57 @@ def _looks_like_is_vitesscejson(self, file_prefix: FilePrefix, load_size: int =
         return False
 
 
+@build_sniff_from_prefix
+class AuspiceJson(Json):
+    """
+    Auspice is a visualization tool for phylogenetic trees and associated data.
+    It uses JSON format to represent the tree structure and metadata.
+    """
+
+    file_ext = "auspice.json"
+
+    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
+        super().set_peek(dataset)
+        if not dataset.dataset.purged:
+            dataset.blurb = "AuspiceJSON"
+
+    def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
+        """
+        Determines whether the file is in Auspice v2 JSON by looking for the
+        strings "version", "meta", "updated" and "panels", which are used by the
+        https://docs.nextstrain.org/projects/auspice/en/stable/releases/v2.html format
+        and are expected to appear in the first part of the file
+
+        >>> from galaxy.datatypes.sniff import get_test_fname
+        >>> fname = get_test_fname( '1.json' )
+        >>> AuspiceJson().sniff( fname )
+        False
+        >>> fname = get_test_fname( '1.auspicejson' )
+        >>> AuspiceJson().sniff( fname )
+        True
+        """
+        is_auspicejson = False
+        if self._looks_like_json(file_prefix):
+            is_auspicejson = self._looks_like_is_auspicejson(file_prefix)
+        return is_auspicejson
+
+    def _looks_like_is_auspicejson(self, file_prefix: FilePrefix, load_size: int = 20000) -> bool:
+        """
+        Expects the file to start with { and the strings 'version', 'meta', 'updated' and 'panels'
+        to all be present within the first ``load_size`` characters read from the file.
+        """
+        try:
+            with open(file_prefix.filename) as fh:
+                segment_str = fh.read(load_size)
+
+                if segment_str.startswith("{") and all(
+                    x in segment_str for x in ["version", "meta", "updated", "panels"]
+                ):
+                    return True
+        except Exception:
+            pass
+        return False
+
+
 @build_sniff_from_prefix
 class Obo(Text):
     """

Original file line number	Diff line number	Diff line change
`@@ -310,7 +310,7 @@ export function validateLocalZipFile(file?: File \| null): string {`
`310`	`310`	`}`
`311`	`311`
`312`	`312`	`export function isLocalZipFile(file?: File \| null): boolean {`
`313`		`- return Boolean(file) && file?.type === "application/zip";`
	`313`	`+ return Boolean(file) && (file?.type === "application/zip" \|\| file?.type === "application/x-zip-compressed");`
`314`	`314`	`}`
`315`	`315`
`316`	`316`	`export async function isRemoteZipFile(url: string): Promise<boolean> {`