Skip to content

Commit eb14c70

Browse files
authored
Merge pull request #355 from jkmckenna/0.3.2
0.3.2
2 parents 79d1798 + 8cf46bc commit eb14c70

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1246
-269
lines changed

.github/workflows/ci.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ name: CI
22

33
on:
44
push:
5-
branches: ["main", "0.3.0"]
5+
branches: ["main"]
66
pull_request:
7-
branches: ["main", "0.3.0"]
7+
branches: ["main"]
88

99
concurrency:
1010
group: ${{ github.workflow }}-${{ github.ref }}
@@ -49,7 +49,7 @@ jobs:
4949
- name: Lint with ruff
5050
run: ruff check --output-format=github .
5151

52-
smoke:
52+
pytest:
5353
runs-on: ubuntu-latest
5454
strategy:
5555
fail-fast: false
@@ -74,9 +74,9 @@ jobs:
7474
- name: Install dependencies
7575
run: |
7676
python -m pip install --upgrade pip
77-
python -m pip install .[dev]
78-
- name: Run smoke tests
79-
run: pytest -m smoke -q
77+
python -m pip install .[dev,torch,plotting]
78+
- name: Run pytest
79+
run: pytest -m "smoke" -q
8080

8181
docs:
8282
runs-on: ubuntu-latest

AGENTS.md

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# AGENTS.md
22

3-
This file tells coding agents (including OpenAI Codex and Claude Code) how to work in this repo.
3+
This file tells coding agents (including OpenAI's Codex, Anthropic's Claude Code, and Google's Gemini) how to work in this repo.
44

5-
Coding agents can only read from AGENTS.md or Claude.md files.
6-
Agents can not edit AGENTS.md or Claude.md files.
5+
- For AGENTS.md or CLAUDE.md files:
6+
- Agents can read from these files.
7+
- Agents can never edit these files.
78

89
## Goals
910
- Make minimal, correct changes.
@@ -12,15 +13,17 @@ Agents can not edit AGENTS.md or Claude.md files.
1213
- Generate production grade, scalable code.
1314

1415
## Prompt interface
15-
- When asked about a problem or task, first describe the plan to handle the task.
16-
- Keep taking prompts until the plan is validated.
16+
- When asked about a problem or task, first read all files relevent to the task's scope.
17+
- Describe the problem given the context.
18+
- Formulate a plan to address the problem within scope.
19+
- Refine the plan with user input.
1720
- Implement code after being told to proceed.
1821

1922
## Repo orientation
2023
- Read existing patterns before inventing new ones.
2124
- Don’t refactor broadly unless asked.
22-
- If you’re unsure about intended behavior, look for tests/docs first.
23-
- If behavior is not clear after tests/docs, look at the Click commands section in this file.
25+
- If you’re unsure about intended behavior, look for tests or docs first.
26+
- If behavior is not clear after reading tests and docs, look at the Click commands section in this file.
2427
- Ignore all files in any directory named "archived".
2528
- User defined parameters exist within src/smftools/config.
2629
- Parameters are herited from default.yaml -> MODALITY.yaml -> user_defined_config.csv

src/smftools/cli/chimeric_adata.py

Lines changed: 95 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,47 @@
1414
ZERO_HAMMING_DISTANCE_SPANS = "zero_hamming_distance_spans"
1515

1616

17+
def _max_positive_span_length(delta_row: "np.ndarray") -> int:
18+
"""Return the max contiguous run length where delta span values are > 0."""
19+
import numpy as np
20+
21+
values = np.asarray(delta_row)
22+
if values.ndim != 1 or values.size == 0:
23+
return 0
24+
25+
positive_mask = values > 0
26+
if not np.any(positive_mask):
27+
return 0
28+
29+
transitions = np.diff(positive_mask.astype(np.int8))
30+
starts = np.flatnonzero(transitions == 1) + 1
31+
ends = np.flatnonzero(transitions == -1) + 1
32+
33+
if positive_mask[0]:
34+
starts = np.r_[0, starts]
35+
if positive_mask[-1]:
36+
ends = np.r_[ends, positive_mask.size]
37+
38+
return int(np.max(ends - starts))
39+
40+
41+
def _compute_chimeric_by_mod_hamming_distance(
42+
delta_layer: "np.ndarray",
43+
span_threshold: int,
44+
) -> "np.ndarray":
45+
"""Flag reads with any delta-hamming span strictly larger than ``span_threshold``."""
46+
import numpy as np
47+
48+
delta_values = np.asarray(delta_layer)
49+
if delta_values.ndim != 2:
50+
raise ValueError("delta_layer must be a 2D array with shape (n_obs, n_vars).")
51+
52+
flags = np.zeros(delta_values.shape[0], dtype=bool)
53+
for obs_idx, row in enumerate(delta_values):
54+
flags[obs_idx] = _max_positive_span_length(row) > span_threshold
55+
return flags
56+
57+
1758
def _build_top_segments_obs_tuples(
1859
read_df: "pd.DataFrame",
1960
obs_names: "pd.Index",
@@ -295,7 +336,10 @@ def chimeric_adata_core(
295336
suffix = "_strand_FASTA_base"
296337
variant_seq1_label = seq1_col[: -len(suffix)] if seq1_col.endswith(suffix) else seq1_col
297338
variant_seq2_label = seq2_col[: -len(suffix)] if seq2_col.endswith(suffix) else seq2_col
298-
logger.info("Detected variant call layer '%s'; will overlay on span clustermaps.", variant_call_layer_name)
339+
logger.info(
340+
"Detected variant call layer '%s'; will overlay on span clustermaps.",
341+
variant_call_layer_name,
342+
)
299343

300344
# ============================================================
301345
# 1) Rolling NN distances + layer clustermaps
@@ -625,7 +669,9 @@ def chimeric_adata_core(
625669
_vc = adata[mask].layers[variant_call_layer_name]
626670
_vc = _vc.toarray() if hasattr(_vc, "toarray") else np.asarray(_vc)
627671
_variant_call_df = pd.DataFrame(
628-
_vc, index=adata[mask].obs_names.astype(str), columns=adata.var_names,
672+
_vc,
673+
index=adata[mask].obs_names.astype(str),
674+
columns=adata.var_names,
629675
)
630676

631677
subset = subset[:, site_mask].copy()
@@ -641,6 +687,9 @@ def chimeric_adata_core(
641687
variant_call_data=_variant_call_df,
642688
seq1_label=variant_seq1_label,
643689
seq2_label=variant_seq2_label,
690+
ref1_marker_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
691+
ref2_marker_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
692+
variant_marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
644693
title=title,
645694
save_name=out_png,
646695
)
@@ -1000,7 +1049,9 @@ def chimeric_adata_core(
10001049
_vc = adata[sample_mask].layers[variant_call_layer_name]
10011050
_vc = _vc.toarray() if hasattr(_vc, "toarray") else np.asarray(_vc)
10021051
_cross_variant_call_df = pd.DataFrame(
1003-
_vc, index=adata[sample_mask].obs_names.astype(str), columns=adata.var_names,
1052+
_vc,
1053+
index=adata[sample_mask].obs_names.astype(str),
1054+
columns=adata.var_names,
10041055
)
10051056

10061057
# --- Plots ---
@@ -1073,6 +1124,15 @@ def chimeric_adata_core(
10731124
variant_call_data=_cross_variant_call_df,
10741125
seq1_label=variant_seq1_label,
10751126
seq2_label=variant_seq2_label,
1127+
ref1_marker_color=getattr(
1128+
cfg, "variant_overlay_seq1_color", "white"
1129+
),
1130+
ref2_marker_color=getattr(
1131+
cfg, "variant_overlay_seq2_color", "black"
1132+
),
1133+
variant_marker_size=getattr(
1134+
cfg, "variant_overlay_marker_size", 4.0
1135+
),
10761136
title=title,
10771137
save_name=out_png,
10781138
)
@@ -1146,12 +1206,31 @@ def chimeric_adata_core(
11461206
)
11471207
delta_layer = np.clip(within_layer - cross_layer, 0, None)
11481208
adata.layers[DELTA_ZERO_HAMMING_DISTANCE_SPANS] = delta_layer
1209+
threshold = getattr(cfg, "delta_hamming_chimeric_span_threshold", 200)
1210+
try:
1211+
threshold = int(threshold)
1212+
except (TypeError, ValueError):
1213+
logger.warning(
1214+
"Invalid delta_hamming_chimeric_span_threshold=%s; using default 200.",
1215+
threshold,
1216+
)
1217+
threshold = 200
1218+
if threshold < 0:
1219+
logger.warning(
1220+
"delta_hamming_chimeric_span_threshold=%s is negative; clamping to 0.",
1221+
threshold,
1222+
)
1223+
threshold = 0
1224+
adata.obs["chimeric_by_mod_hamming_distance"] = (
1225+
_compute_chimeric_by_mod_hamming_distance(delta_layer, threshold)
1226+
)
11491227
else:
11501228
logger.warning(
11511229
"Cannot compute delta: missing %s or %s layer.",
11521230
ZERO_HAMMING_DISTANCE_SPANS,
11531231
CROSS_SAMPLE_ZERO_HAMMING_DISTANCE_SPANS,
11541232
)
1233+
adata.obs["chimeric_by_mod_hamming_distance"] = False
11551234

11561235
if DELTA_ZERO_HAMMING_DISTANCE_SPANS in adata.layers:
11571236
for reference in references:
@@ -1300,7 +1379,9 @@ def chimeric_adata_core(
13001379
.astype("category")
13011380
.cat.categories.tolist()
13021381
)
1303-
references = adata.obs[cfg.reference_column].astype("category").cat.categories.tolist()
1382+
references = (
1383+
adata.obs[cfg.reference_column].astype("category").cat.categories.tolist()
1384+
)
13041385

13051386
for reference in references:
13061387
ref_mask = adata.obs[cfg.reference_column] == reference
@@ -1332,9 +1413,7 @@ def chimeric_adata_core(
13321413
safe_sample = str(sample).replace(os.sep, "_")
13331414
safe_ref = str(reference).replace(os.sep, "_")
13341415
n_reads = int(sample_mask.sum())
1335-
trio_title = (
1336-
f"{sample} {reference} (n={n_reads})"
1337-
)
1416+
trio_title = f"{sample} {reference} (n={n_reads})"
13381417
out_png = span_trio_dir / f"{safe_sample}__{safe_ref}.png"
13391418
try:
13401419
plot_hamming_span_trio(
@@ -1345,6 +1424,15 @@ def chimeric_adata_core(
13451424
variant_call_data=_variant_call_df,
13461425
seq1_label=variant_seq1_label,
13471426
seq2_label=variant_seq2_label,
1427+
ref1_marker_color=getattr(
1428+
cfg, "variant_overlay_seq1_color", "white"
1429+
),
1430+
ref2_marker_color=getattr(
1431+
cfg, "variant_overlay_seq2_color", "black"
1432+
),
1433+
variant_marker_size=getattr(
1434+
cfg, "variant_overlay_marker_size", 4.0
1435+
),
13481436
title=trio_title,
13491437
save_name=out_png,
13501438
)

src/smftools/cli/hmm_adata.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,6 +1089,10 @@ def hmm_adata_core(
10891089
deaminase=deaminase,
10901090
min_signal=0,
10911091
index_col_suffix=cfg.reindexed_var_suffix,
1092+
overlay_variant_calls=getattr(cfg, "overlay_variant_calls", False),
1093+
variant_overlay_seq1_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
1094+
variant_overlay_seq2_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
1095+
variant_overlay_marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
10921096
)
10931097

10941098
hmm_length_dir = hmm_directory / "12b_hmm_length_clustermaps"
@@ -1145,6 +1149,10 @@ def hmm_adata_core(
11451149
min_signal=0,
11461150
index_col_suffix=cfg.reindexed_var_suffix,
11471151
length_feature_ranges=length_feature_ranges,
1152+
overlay_variant_calls=getattr(cfg, "overlay_variant_calls", False),
1153+
variant_overlay_seq1_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
1154+
variant_overlay_seq2_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
1155+
variant_overlay_marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
11481156
)
11491157

11501158
hmm_dir = hmm_directory / "13_hmm_bulk_traces"

src/smftools/cli/variant_adata.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,13 +221,18 @@ def _find_converted_column(unconverted_col: str, var_columns) -> str | None:
221221
prefix = f"{chrom}_"
222222
end = f"_{strand}_{strand}{suffix}"
223223
candidates = [
224-
c for c in var_columns
224+
c
225+
for c in var_columns
225226
if c.startswith(prefix) and c.endswith(end) and c != unconverted_col
226227
]
227228
if len(candidates) == 1:
228229
return candidates[0]
229230
if len(candidates) > 1:
230-
logger.info("Multiple converted column candidates for '%s': %s", unconverted_col, candidates)
231+
logger.info(
232+
"Multiple converted column candidates for '%s': %s",
233+
unconverted_col,
234+
candidates,
235+
)
231236
return candidates[0]
232237
break
233238
return None
@@ -252,6 +257,7 @@ def _find_converted_column(unconverted_col: str, var_columns) -> str | None:
252257
seq1_column=seq1_col,
253258
seq2_column=seq2_col,
254259
read_span_layer=cfg.mismatch_frequency_read_span_layer,
260+
reference_col=cfg.reference_column,
255261
)
256262

257263
############################################### Plot mismatch base frequencies ###############################################
@@ -365,7 +371,38 @@ def _find_converted_column(unconverted_col: str, var_columns) -> str | None:
365371
variant_segment_layer=segment_layer_name,
366372
read_span_layer=cfg.mismatch_frequency_read_span_layer,
367373
save_path=segment_dir,
374+
ref1_marker_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
375+
ref2_marker_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
376+
marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
377+
show_position_axis=True,
378+
)
379+
380+
segment_type_dir = (
381+
variant_directory
382+
/ "deduplicated"
383+
/ "05_variant_segment_clustermaps_with_mismatch_type"
384+
)
385+
if segment_type_dir.exists():
386+
logger.info(
387+
"Variant segment mismatch-type clustermaps already exist at %s; skipping.",
388+
segment_type_dir,
389+
)
390+
else:
391+
make_dirs([segment_type_dir])
392+
plot_variant_segment_clustermaps(
393+
adata,
394+
seq1_column=seq1_col,
395+
seq2_column=seq2_col,
396+
sample_col=cfg.sample_name_col_for_plotting,
397+
reference_col=cfg.reference_column,
398+
variant_segment_layer=segment_layer_name,
399+
read_span_layer=cfg.mismatch_frequency_read_span_layer,
400+
save_path=segment_type_dir,
401+
ref1_marker_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
402+
ref2_marker_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
403+
marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
368404
show_position_axis=True,
405+
mismatch_type_obs_col="chimeric_variant_sites_type",
369406
)
370407

371408
# ============================================================

src/smftools/config/default.yaml

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@ clustermap_cmap_cpg: "coolwarm"
188188
clustermap_cmap_a: "coolwarm"
189189
spatial_clustermap_sortby: "gpc"
190190

191+
# Clustermap variant params
192+
overlay_variant_calls: false
193+
variant_overlay_seq1_color: "black"
194+
variant_overlay_seq2_color: "white"
195+
variant_overlay_marker_size: 4.0
196+
191197
# Spatial analysis - Rolling NN Hamming
192198
rolling_nn_layer: "nan0_0minus1"
193199
rolling_nn_plot_layer: "nan0_0minus1"
@@ -225,6 +231,7 @@ rolling_nn_zero_pairs_segment_histogram_bins: 30
225231
cross_sample_analysis: true
226232
cross_sample_grouping_col: null
227233
cross_sample_random_seed: 42
234+
delta_hamming_chimeric_span_threshold: 200
228235

229236
# Latent Analysis - UMAP/Leiden params
230237
layer_for_umap_plotting: 'nan_half'
@@ -316,21 +323,13 @@ hmm_merge_layer_features:
316323
- ["all_accessible_features", 60]
317324
clustermap_cmap_hmm: "coolwarm"
318325
hmm_clustermap_feature_layers:
319-
- all_accessible_features
320326
- all_accessible_features_merged
321-
- small_accessible_patch
322-
- mid_accessible_patch
323-
- large_accessible_patch
324-
- large_accessible_patch_merged
325-
- nucleosome_depleted_region
326327
- nucleosome_depleted_region_merged
327328
- small_bound_stretch
328329
- medium_bound_stretch
329330
- putative_nucleosome
330-
- large_bound_stretch
331331
- all_footprint_features
332332
hmm_clustermap_length_layers:
333-
- all_accessible_features
334333
- all_accessible_features_merged
335334
- all_footprint_features
336335
hmm_clustermap_sortby: "hmm"

0 commit comments

Comments
 (0)