@@ -143,14 +143,16 @@ def create_anndata(
143143 Returns:
144144 ad.AnnData: The generated AnnData object containing the transcriptomics data and metadata.
145145 """
146- # df_filtered = filter_transcripts(df, min_qv=qv_threshold)
147- df_filtered = df
148- # metrics = compute_transcript_metrics(df_filtered, qv_threshold, cell_id_col)
149- df_filtered = df_filtered [ df_filtered [ cell_id_col ]. astype ( str ) != "-1" ]
146+ # Filter out unassigned cells
147+ df_filtered = df [ df [ cell_id_col ]. astype ( str ) != "UNASSIGNED" ]
148+
149+ # Create pivot table for gene expression counts per cell
150150 pivot_df = df_filtered .rename (columns = {cell_id_col : "cell" , "feature_name" : "gene" })[["cell" , "gene" ]].pivot_table (
151151 index = "cell" , columns = "gene" , aggfunc = "size" , fill_value = 0
152152 )
153153 pivot_df = pivot_df [pivot_df .sum (axis = 1 ) >= min_transcripts ]
154+
155+ # Summarize cell metrics
154156 cell_summary = []
155157 for cell_id , cell_data in df_filtered .groupby (cell_id_col ):
156158 if len (cell_data ) < min_transcripts :
@@ -159,49 +161,46 @@ def create_anndata(
159161 cell_area = cell_convex_hull .area
160162 if cell_area < min_cell_area or cell_area > max_cell_area :
161163 continue
162- # if 'nucleus_distance' in cell_data:
163- # nucleus_data = cell_data[cell_data['nucleus_distance'] == 0]
164- # else:
165- # nucleus_data = cell_data[cell_data['overlaps_nucleus'] == 1]
166- # if len(nucleus_data) >= 3:
167- # nucleus_convex_hull = ConvexHull(nucleus_data[['x_location', 'y_location']])
168- # else:
169- # nucleus_convex_hull = None
170164 cell_summary .append (
171165 {
172166 "cell" : cell_id ,
173167 "cell_centroid_x" : cell_data ["x_location" ].mean (),
174168 "cell_centroid_y" : cell_data ["y_location" ].mean (),
175169 "cell_area" : cell_area ,
176- # "nucleus_centroid_x": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(),
177- # "nucleus_centroid_y": nucleus_data['x_location'].mean() if len(nucleus_data) > 0 else cell_data['x_location'].mean(),
178- # "nucleus_area": nucleus_convex_hull.area if nucleus_convex_hull else 0,
179- # "percent_cytoplasmic": len(cell_data[cell_data['overlaps_nucleus'] != 1]) / len(cell_data) * 100,
180- # "has_nucleus": len(nucleus_data) > 0
181170 }
182171 )
183172 cell_summary = pd .DataFrame (cell_summary ).set_index ("cell" )
173+
174+ # Add genes from panel_df (if provided) to the pivot table
184175 if panel_df is not None :
185176 panel_df = panel_df .sort_values ("gene" )
186177 genes = panel_df ["gene" ].values
187178 for gene in genes :
188179 if gene not in pivot_df :
189180 pivot_df [gene ] = 0
190181 pivot_df = pivot_df [genes .tolist ()]
182+
183+ # Create var DataFrame
191184 if panel_df is None :
192185 var_df = pd .DataFrame (
193186 [
194- {"gene" : i , "feature_types" : "Gene Expression" , "genome" : "Unknown" }
195- for i in np .unique (pivot_df .columns .values )
187+ {"gene" : gene , "feature_types" : "Gene Expression" , "genome" : "Unknown" }
188+ for gene in np .unique (pivot_df .columns .values )
196189 ]
197190 ).set_index ("gene" )
198191 else :
199192 var_df = panel_df [["gene" , "ensembl" ]].rename (columns = {"ensembl" : "gene_ids" })
200193 var_df ["feature_types" ] = "Gene Expression"
201194 var_df ["genome" ] = "Unknown"
202195 var_df = var_df .set_index ("gene" )
203- # gene_metrics = metrics['gene_metrics'].set_index('feature_name')
204- # var_df = var_df.join(gene_metrics, how='left').fillna(0)
196+
197+ # Compute total assigned and unassigned transcript counts for each gene
198+ assigned_counts = df_filtered .groupby ("feature_name" )["feature_name" ].count ()
199+ unassigned_counts = df [df [cell_id_col ].astype (str ) == "UNASSIGNED" ].groupby ("feature_name" )["feature_name" ].count ()
200+ var_df ["total_assigned" ] = var_df .index .map (assigned_counts ).fillna (0 ).astype (int )
201+ var_df ["total_unassigned" ] = var_df .index .map (unassigned_counts ).fillna (0 ).astype (int )
202+
203+ # Filter cells and create the AnnData object
205204 cells = list (set (pivot_df .index ) & set (cell_summary .index ))
206205 pivot_df = pivot_df .loc [cells , :]
207206 cell_summary = cell_summary .loc [cells , :]
@@ -211,12 +210,7 @@ def create_anndata(
211210 adata .obs ["unique_transcripts" ] = (pivot_df > 0 ).sum (axis = 1 ).values
212211 adata .obs_names = pivot_df .index .values .tolist ()
213212 adata .obs = pd .merge (adata .obs , cell_summary .loc [adata .obs_names , :], left_index = True , right_index = True )
214- # adata.uns['metrics'] = {
215- # 'percent_assigned': metrics['percent_assigned'],
216- # 'percent_cytoplasmic': metrics['percent_cytoplasmic'],
217- # 'percent_nucleus': metrics['percent_nucleus'],
218- # 'percent_non_assigned_cytoplasmic': metrics['percent_non_assigned_cytoplasmic']
219- # }
213+
220214 return adata
221215
222216
0 commit comments