Skip to content

Commit 780f0b0

Browse files
authored
adapted to UNASSIGNED.
1 parent 462d239 commit 780f0b0

File tree

1 file changed

+21
-27
lines changed

1 file changed

+21
-27
lines changed

src/segger/data/utils.py

Lines changed: 21 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -143,14 +143,16 @@ def create_anndata(
143143
Returns:
144144
ad.AnnData: The generated AnnData object containing the transcriptomics data and metadata.
145145
"""
146-
# df_filtered = filter_transcripts(df, min_qv=qv_threshold)
147-
df_filtered = df
148-
# metrics = compute_transcript_metrics(df_filtered, qv_threshold, cell_id_col)
149-
df_filtered = df_filtered[df_filtered[cell_id_col].astype(str) != "-1"]
146+
# Filter out unassigned cells
147+
df_filtered = df[df[cell_id_col].astype(str) != "UNASSIGNED"]
148+
149+
# Create pivot table for gene expression counts per cell
150150
pivot_df = df_filtered.rename(columns={cell_id_col: "cell", "feature_name": "gene"})[["cell", "gene"]].pivot_table(
151151
index="cell", columns="gene", aggfunc="size", fill_value=0
152152
)
153153
pivot_df = pivot_df[pivot_df.sum(axis=1) >= min_transcripts]
154+
155+
# Summarize cell metrics
154156
cell_summary = []
155157
for cell_id, cell_data in df_filtered.groupby(cell_id_col):
156158
if len(cell_data) < min_transcripts:
@@ -159,49 +161,46 @@ def create_anndata(
159161
cell_area = cell_convex_hull.area
160162
if cell_area < min_cell_area or cell_area > max_cell_area:
161163
continue
162-
# if 'nucleus_distance' in cell_data:
163-
# nucleus_data = cell_data[cell_data['nucleus_distance'] == 0]
164-
# else:
165-
# nucleus_data = cell_data[cell_data['overlaps_nucleus'] == 1]
166-
# if len(nucleus_data) >= 3:
167-
# nucleus_convex_hull = ConvexHull(nucleus_data[['x_location', 'y_location']])
168-
# else:
169-
# nucleus_convex_hull = None
170164
cell_summary.append(
171165
{
172166
"cell": cell_id,
173167
"cell_centroid_x": cell_data["x_location"].mean(),
174168
"cell_centroid_y": cell_data["y_location"].mean(),
175169
"cell_area": cell_area,
176-
# "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(),
177-
# "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(),
178-
# "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0,
179-
# "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100,
180-
# "has_nucleus": len(nucleus_data) > 0
181170
}
182171
)
183172
cell_summary = pd.DataFrame(cell_summary).set_index("cell")
173+
174+
# Add genes from panel_df (if provided) to the pivot table
184175
if panel_df is not None:
185176
panel_df = panel_df.sort_values("gene")
186177
genes = panel_df["gene"].values
187178
for gene in genes:
188179
if gene not in pivot_df:
189180
pivot_df[gene] = 0
190181
pivot_df = pivot_df[genes.tolist()]
182+
183+
# Create var DataFrame
191184
if panel_df is None:
192185
var_df = pd.DataFrame(
193186
[
194-
{"gene": i, "feature_types": "Gene Expression", "genome": "Unknown"}
195-
for i in np.unique(pivot_df.columns.values)
187+
{"gene": gene, "feature_types": "Gene Expression", "genome": "Unknown"}
188+
for gene in np.unique(pivot_df.columns.values)
196189
]
197190
).set_index("gene")
198191
else:
199192
var_df = panel_df[["gene", "ensembl"]].rename(columns={"ensembl": "gene_ids"})
200193
var_df["feature_types"] = "Gene Expression"
201194
var_df["genome"] = "Unknown"
202195
var_df = var_df.set_index("gene")
203-
# gene_metrics = metrics['gene_metrics'].set_index('feature_name')
204-
# var_df = var_df.join(gene_metrics, how='left').fillna(0)
196+
197+
# Compute total assigned and unassigned transcript counts for each gene
198+
assigned_counts = df_filtered.groupby("feature_name")["feature_name"].count()
199+
unassigned_counts = df[df[cell_id_col].astype(str) == "UNASSIGNED"].groupby("feature_name")["feature_name"].count()
200+
var_df["total_assigned"] = var_df.index.map(assigned_counts).fillna(0).astype(int)
201+
var_df["total_unassigned"] = var_df.index.map(unassigned_counts).fillna(0).astype(int)
202+
203+
# Filter cells and create the AnnData object
205204
cells = list(set(pivot_df.index) & set(cell_summary.index))
206205
pivot_df = pivot_df.loc[cells, :]
207206
cell_summary = cell_summary.loc[cells, :]
@@ -211,12 +210,7 @@ def create_anndata(
211210
adata.obs["unique_transcripts"] = (pivot_df > 0).sum(axis=1).values
212211
adata.obs_names = pivot_df.index.values.tolist()
213212
adata.obs = pd.merge(adata.obs, cell_summary.loc[adata.obs_names, :], left_index=True, right_index=True)
214-
# adata.uns['metrics'] = {
215-
# 'percent_assigned': metrics['percent_assigned'],
216-
# 'percent_cytoplasmic': metrics['percent_cytoplasmic'],
217-
# 'percent_nucleus': metrics['percent_nucleus'],
218-
# 'percent_non_assigned_cytoplasmic': metrics['percent_non_assigned_cytoplasmic']
219-
# }
213+
220214
return adata
221215

222216

0 commit comments

Comments
 (0)