Skip to content

Commit d55a988

Browse files
committed
hashing removed
1 parent 28a5ab2 commit d55a988

File tree

2 files changed

+4
-77
lines changed

2 files changed

+4
-77
lines changed

notebooks/notebook.ipynb

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@
207207
"\n",
208208
"# Normalize counts to 1e4 per cell and log-transform for export\n",
209209
"sc.pp.normalize_total(adata_complete, target_sum=1e4)\n",
210-
"sc.pp.log1p(adata_complete)\n"
210+
"sc.pp.log1p(adata_complete)"
211211
]
212212
},
213213
{
@@ -273,8 +273,7 @@
273273
"⚠ Skipping points.bin.gz: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/points.bin.gz already exists (use force=True to overwrite)\n",
274274
"⚠ Skipping obs manifest: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/obs_manifest.json already exists (use force=True to overwrite)\n",
275275
"⚠ Skipping var manifest: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/var_manifest.json already exists (use force=True to overwrite)\n",
276-
"⚠ Skipping connectivity manifest: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/connectivity_manifest.json already exists (use force=True to overwrite)\n",
277-
"✓ Wrote dataset signature (hash=abeda2e0656c...) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/dataset_hash.json\n"
276+
"⚠ Skipping connectivity manifest: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/connectivity_manifest.json already exists (use force=True to overwrite)\n"
278277
]
279278
}
280279
],
@@ -311,7 +310,7 @@
311310
},
312311
{
313312
"cell_type": "code",
314-
"execution_count": 10,
313+
"execution_count": 11,
315314
"id": "a19552d8",
316315
"metadata": {},
317316
"outputs": [
@@ -335,7 +334,7 @@
335334
" 'var_gene_id_column': 'converted_id'}}}"
336335
]
337336
},
338-
"execution_count": 10,
337+
"execution_count": 11,
339338
"metadata": {},
340339
"output_type": "execute_result"
341340
}

src/cellucid/prepare_data.py

Lines changed: 0 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,9 @@
1717
- connectivities: sparse matrix with KNN connectivities
1818
"""
1919
import gzip
20-
import hashlib
2120
import json
2221
import re
2322
import tqdm
24-
from datetime import datetime
2523
from pathlib import Path
2624
from typing import Optional, Sequence, Union, Literal
2725

@@ -45,17 +43,6 @@ def _safe_filename_component(name: str) -> str:
4543
return safe or "field"
4644

4745

48-
def _sha256_json(obj: object) -> str:
49-
"""Stable sha256 over a JSON-serializable object."""
50-
normalized = json.dumps(obj, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
51-
return hashlib.sha256(normalized.encode("utf-8")).hexdigest()
52-
53-
54-
def _sha256_list_str(items: Sequence[str]) -> str:
55-
"""Convenience hash for lists of strings."""
56-
return _sha256_json([str(x) for x in items])
57-
58-
5946
def _to_dense(arr: Union[np.ndarray, sparse.spmatrix]) -> np.ndarray:
6047
"""Convert sparse matrix to dense numpy array if necessary."""
6148
if sparse.issparse(arr):
@@ -561,7 +548,6 @@ def export_data_for_web(
561548
"key": str(key),
562549
"kind": "category",
563550
"category_count": n_categories,
564-
"categories_hash": _sha256_list_str(categories),
565551
"codes_dtype": dtype_str,
566552
"outlier_quantized": obs_continuous_quantization is not None,
567553
"outlier_quantization_bits": int(obs_continuous_quantization)
@@ -802,7 +788,6 @@ def export_data_for_web(
802788

803789
# Process gene expression if provided
804790
genes_to_export: list[str] = []
805-
gene_ids_hash: Optional[str] = None
806791
if gene_expression is not None:
807792
if var is None:
808793
raise ValueError("var DataFrame must be provided when gene_expression is given.")
@@ -846,8 +831,6 @@ def export_data_for_web(
846831
print(f"⚠ Warning: {len(missing_genes)} gene identifiers not found in var: {missing_genes[:5]}...")
847832
genes_to_export = [g for g in genes_to_export if g in gene_id_to_idx]
848833

849-
gene_ids_hash = _sha256_list_str(genes_to_export)
850-
851834
var_manifest_path = out_dir / var_manifest_filename
852835
if _file_exists_skip(var_manifest_path, "var manifest", force):
853836
pass
@@ -1110,58 +1093,3 @@ def export_data_for_web(
11101093
)
11111094
else:
11121095
print("INFO: No connectivity data provided, skipping connectivity export.")
1113-
1114-
# --- Dataset signature for clash detection ---------------------------------
1115-
obs_fields_hash = _sha256_json(obs_field_summaries)
1116-
gene_count = len(genes_to_export) if genes_to_export else 0
1117-
1118-
hash_inputs = {
1119-
"version": 1,
1120-
"manifest_format": MANIFEST_FORMAT_VERSION,
1121-
"n_cells": int(n_cells),
1122-
"latent_dims": int(latent.shape[1]),
1123-
"obs_keys": [str(k) for k in obs_keys],
1124-
"obs_fields_hash": obs_fields_hash,
1125-
"obs_fields": obs_field_summaries,
1126-
"obs_continuous_quantization": obs_continuous_quantization,
1127-
"obs_categorical_dtype": obs_categorical_dtype,
1128-
"var_present": gene_expression is not None,
1129-
"gene_count": gene_count,
1130-
"gene_list_hash": gene_ids_hash,
1131-
"var_gene_id_column": var_gene_id_column if gene_expression is not None else None,
1132-
"var_quantization": var_quantization,
1133-
"connectivity": connectivity_meta,
1134-
"compression": compression if compression else None,
1135-
"points_filename": points_filename + (".gz" if compression else ""),
1136-
"obs_manifest": obs_manifest_filename,
1137-
"var_manifest": var_manifest_filename if gene_expression is not None else None,
1138-
"connectivity_manifest": connectivity_manifest_filename if connectivities is not None else None,
1139-
"centroid_outlier_quantile": centroid_outlier_quantile if centroid_outlier_quantile is not None else None,
1140-
"centroid_min_points": centroid_min_points,
1141-
}
1142-
1143-
signature = _sha256_json(hash_inputs)
1144-
dataset_hash_payload = {
1145-
"version": 1,
1146-
"created_at": datetime.utcnow().isoformat() + "Z",
1147-
"signature": signature,
1148-
"summary": {
1149-
"n_cells": int(n_cells),
1150-
"obs_field_count": len(obs_field_summaries),
1151-
"obs_keys": [str(k) for k in obs_keys],
1152-
"obs_fields_hash": obs_fields_hash,
1153-
"gene_count": gene_count,
1154-
"gene_list_hash": gene_ids_hash,
1155-
"compression": compression if compression else None,
1156-
"connectivity_edges": connectivity_meta.get("n_edges"),
1157-
},
1158-
"details": {
1159-
"hash_inputs": hash_inputs,
1160-
"gene_sample": genes_to_export[:10] if genes_to_export else [],
1161-
"notes": "signature is sha256 over hash_inputs; compare two dataset_hash.json files to diagnose clashes",
1162-
},
1163-
}
1164-
1165-
dataset_hash_path = out_dir / "dataset_hash.json"
1166-
dataset_hash_path.write_text(json.dumps(dataset_hash_payload, ensure_ascii=True, indent=2), encoding="utf-8")
1167-
print(f"✓ Wrote dataset signature (hash={signature[:12]}...) to {dataset_hash_path}")

0 commit comments

Comments
 (0)