1717- connectivities: sparse matrix with KNN connectivities
1818"""
1919import gzip
20- import hashlib
2120import json
2221import re
2322import tqdm
24- from datetime import datetime
2523from pathlib import Path
2624from typing import Optional , Sequence , Union , Literal
2725
@@ -45,17 +43,6 @@ def _safe_filename_component(name: str) -> str:
4543 return safe or "field"
4644
4745
def _sha256_json(obj: object) -> str:
    """Return the hex SHA-256 digest of the canonical JSON form of ``obj``.

    The object is serialized with sorted dictionary keys, compact
    separators (no whitespace) and ASCII-only escaping, so two objects
    that differ only in dictionary key order produce the same digest.

    Args:
        obj: Any value accepted by ``json.dumps``.

    Returns:
        The 64-character lowercase hexadecimal SHA-256 digest.

    Raises:
        TypeError: Propagated from ``json.dumps`` when ``obj`` contains a
            value that is not JSON-serializable.
    """
    canonical_text = json.dumps(
        obj,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=True,
    )
    hasher = hashlib.sha256()
    hasher.update(canonical_text.encode("utf-8"))
    return hasher.hexdigest()
52-
53-
def _sha256_list_str(items: Sequence[str]) -> str:
    """Hash a sequence of strings via ``_sha256_json``.

    Every element is passed through ``str`` first, and the resulting
    list is hashed in its given order.

    Args:
        items: The values to hash.

    Returns:
        The hexadecimal digest returned by ``_sha256_json`` for the
        stringified list.
    """
    as_strings = list(map(str, items))
    return _sha256_json(as_strings)
57-
58-
5946def _to_dense (arr : Union [np .ndarray , sparse .spmatrix ]) -> np .ndarray :
6047 """Convert sparse matrix to dense numpy array if necessary."""
6148 if sparse .issparse (arr ):
@@ -561,7 +548,6 @@ def export_data_for_web(
561548 "key" : str (key ),
562549 "kind" : "category" ,
563550 "category_count" : n_categories ,
564- "categories_hash" : _sha256_list_str (categories ),
565551 "codes_dtype" : dtype_str ,
566552 "outlier_quantized" : obs_continuous_quantization is not None ,
567553 "outlier_quantization_bits" : int (obs_continuous_quantization )
@@ -802,7 +788,6 @@ def export_data_for_web(
802788
803789 # Process gene expression if provided
804790 genes_to_export : list [str ] = []
805- gene_ids_hash : Optional [str ] = None
806791 if gene_expression is not None :
807792 if var is None :
808793 raise ValueError ("var DataFrame must be provided when gene_expression is given." )
@@ -846,8 +831,6 @@ def export_data_for_web(
846831 print (f"⚠ Warning: { len (missing_genes )} gene identifiers not found in var: { missing_genes [:5 ]} ..." )
847832 genes_to_export = [g for g in genes_to_export if g in gene_id_to_idx ]
848833
849- gene_ids_hash = _sha256_list_str (genes_to_export )
850-
851834 var_manifest_path = out_dir / var_manifest_filename
852835 if _file_exists_skip (var_manifest_path , "var manifest" , force ):
853836 pass
@@ -1110,58 +1093,3 @@ def export_data_for_web(
11101093 )
11111094 else :
11121095 print ("INFO: No connectivity data provided, skipping connectivity export." )
1113-
1114- # --- Dataset signature for clash detection ---------------------------------
1115- obs_fields_hash = _sha256_json (obs_field_summaries )
1116- gene_count = len (genes_to_export ) if genes_to_export else 0
1117-
1118- hash_inputs = {
1119- "version" : 1 ,
1120- "manifest_format" : MANIFEST_FORMAT_VERSION ,
1121- "n_cells" : int (n_cells ),
1122- "latent_dims" : int (latent .shape [1 ]),
1123- "obs_keys" : [str (k ) for k in obs_keys ],
1124- "obs_fields_hash" : obs_fields_hash ,
1125- "obs_fields" : obs_field_summaries ,
1126- "obs_continuous_quantization" : obs_continuous_quantization ,
1127- "obs_categorical_dtype" : obs_categorical_dtype ,
1128- "var_present" : gene_expression is not None ,
1129- "gene_count" : gene_count ,
1130- "gene_list_hash" : gene_ids_hash ,
1131- "var_gene_id_column" : var_gene_id_column if gene_expression is not None else None ,
1132- "var_quantization" : var_quantization ,
1133- "connectivity" : connectivity_meta ,
1134- "compression" : compression if compression else None ,
1135- "points_filename" : points_filename + (".gz" if compression else "" ),
1136- "obs_manifest" : obs_manifest_filename ,
1137- "var_manifest" : var_manifest_filename if gene_expression is not None else None ,
1138- "connectivity_manifest" : connectivity_manifest_filename if connectivities is not None else None ,
1139- "centroid_outlier_quantile" : centroid_outlier_quantile if centroid_outlier_quantile is not None else None ,
1140- "centroid_min_points" : centroid_min_points ,
1141- }
1142-
1143- signature = _sha256_json (hash_inputs )
1144- dataset_hash_payload = {
1145- "version" : 1 ,
1146- "created_at" : datetime .utcnow ().isoformat () + "Z" ,
1147- "signature" : signature ,
1148- "summary" : {
1149- "n_cells" : int (n_cells ),
1150- "obs_field_count" : len (obs_field_summaries ),
1151- "obs_keys" : [str (k ) for k in obs_keys ],
1152- "obs_fields_hash" : obs_fields_hash ,
1153- "gene_count" : gene_count ,
1154- "gene_list_hash" : gene_ids_hash ,
1155- "compression" : compression if compression else None ,
1156- "connectivity_edges" : connectivity_meta .get ("n_edges" ),
1157- },
1158- "details" : {
1159- "hash_inputs" : hash_inputs ,
1160- "gene_sample" : genes_to_export [:10 ] if genes_to_export else [],
1161- "notes" : "signature is sha256 over hash_inputs; compare two dataset_hash.json files to diagnose clashes" ,
1162- },
1163- }
1164-
1165- dataset_hash_path = out_dir / "dataset_hash.json"
1166- dataset_hash_path .write_text (json .dumps (dataset_hash_payload , ensure_ascii = True , indent = 2 ), encoding = "utf-8" )
1167- print (f"✓ Wrote dataset signature (hash={ signature [:12 ]} ...) to { dataset_hash_path } " )
0 commit comments