
Commit add555f

add shuji suzuki method

1 parent 11beedc commit add555f

File tree

2 files changed: +319 -0 lines changed
Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
__merge__: ../../api/comp_method.yaml
name: neurips2022_shuji_suzuki

label: "Shuji Suzuki"
summary: "1st place solution from the NeurIPS 2022 multimodal competition, by Shuji Suzuki"
description: |
  This is the winning solution from the NeurIPS 2022 multimodal single-cell
  integration competition by Shuji Suzuki. The method uses neural networks
  with advanced preprocessing techniques, including:

  - tSVD-based imputation for multiome data (see the sketch after this config)
  - Gene selection based on correlation and pathway information (Reactome)
  - Ensemble modeling with multiple models trained on different seeds and batches
  - Weighted averaging of predictions from the different models

  For CITEseq: uses correlation-based gene selection and pathway information.
  For Multiome: uses tSVD-based imputation and advanced preprocessing.
info:
  repository_url: https://github.com/shu65/open-problems-multimodal/tree/main

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1
    setup:
      - type: python
        pypi:
          - optuna

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [highmem, veryhightime, midcpu, highsharedmem]
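
The description above lists tSVD-based imputation for multiome data, but the simplified script in this commit only uses truncated SVD for dimensionality reduction. As a rough illustration of the imputation idea, here is a minimal sketch: project the matrix into a truncated-SVD space and fill zero entries with their low-rank reconstruction. It assumes a dense, non-negative matrix X whose zeros are treated as missing; the function tsvd_impute and its parameters are illustrative, not taken from Suzuki's repository.

# Minimal sketch of tSVD-based imputation (illustrative, not Suzuki's actual code).
# Assumes X is a dense, non-negative (n_cells, n_features) matrix whose zero
# entries are treated as missing values.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def tsvd_impute(X, n_components=64, random_state=42):
    """Fill zero entries of X with their truncated-SVD (low-rank) reconstruction."""
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    Z = svd.fit_transform(X)                 # (n_cells, n_components) embedding
    X_hat = Z @ svd.components_              # low-rank reconstruction of X
    X_imputed = np.where(X == 0, X_hat, X)   # keep observed values, fill zeros
    return np.maximum(X_imputed, 0)          # counts stay non-negative

Keeping observed entries untouched and only replacing zeros is one common convention; the exact imputation used in the winning solution is in the linked repository.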
Lines changed: 282 additions & 0 deletions

@@ -0,0 +1,282 @@
#!/usr/bin/env python3

import anndata as ad
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
warnings.filterwarnings('ignore')

## VIASH START
par = {
    'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad',
    'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad',
    'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad',
    'output': 'output.h5ad'
}
# 'name' is read below for uns['method_id']; Viash injects the real value at runtime,
# and a default is set here so the script also runs standalone.
meta = {'resources_dir': '.', 'name': 'neurips2022_shuji_suzuki'}
## VIASH END

class MLPModel(nn.Module):
    """Feed-forward network inspired by Shuji Suzuki's approach."""

    def __init__(self, input_dim, output_dim, hidden_dims=(512, 256, 128), dropout=0.2):
        super().__init__()

        layers = []
        prev_dim = input_dim

        # Stack Linear -> BatchNorm -> ReLU -> Dropout blocks.
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

def preprocess_data(adata_mod1):
    """Per-cell normalization and log1p, inspired by Shuji Suzuki's method."""
    print(f"Preprocessing data - shape: {adata_mod1.shape}")

    # Handle different data storage formats.
    if adata_mod1.X is None:
        # Data might be in layers.
        if 'X' in adata_mod1.layers:
            X = adata_mod1.layers['X']
        elif len(adata_mod1.layers) > 0:
            # Fall back to the first available layer.
            layer_name = list(adata_mod1.layers.keys())[0]
            X = adata_mod1.layers[layer_name]
            print(f"Using layer: {layer_name}")
        else:
            raise ValueError("No data found in .X or .layers")
    else:
        X = adata_mod1.X

    # Convert to dense if sparse.
    if hasattr(X, 'toarray'):
        X = X.toarray()
    else:
        X = X.copy()

    # Normalize per cell to 10,000 counts (like sc.pp.normalize_per_cell).
    cell_sums = np.sum(X, axis=1, keepdims=True)
    cell_sums[cell_sums == 0] = 1  # avoid division by zero
    X = X / cell_sums * 1e4

    # Log1p transformation.
    X = np.log1p(X)

    return X

def select_features(X_train, y_train, n_features=2000):
    """Correlation-based feature selection (SelectKBest with f_regression)."""
    print(f"Selecting top {n_features} features from {X_train.shape[1]} features")

    # Score each input feature against the mean target expression across proteins.
    selector = SelectKBest(score_func=f_regression, k=min(n_features, X_train.shape[1]))
    X_selected = selector.fit_transform(X_train, y_train.mean(axis=1))

    return X_selected, selector

def apply_dimensionality_reduction(X, n_components=128):
    """Reduce dimensionality with truncated SVD."""
    print(f"Applying SVD dimensionality reduction to {n_components} components")

    # n_components must not exceed min(n_samples - 1, n_features).
    svd = TruncatedSVD(n_components=min(n_components, X.shape[1], X.shape[0] - 1), random_state=42)
    X_reduced = svd.fit_transform(X)

    return X_reduced, svd

def train_model(X_train, y_train, input_dim, output_dim, n_epochs=100, lr=1e-3):
    """Train the MLP with full-batch gradient descent and early stopping."""
    print(f"Training model - input_dim: {input_dim}, output_dim: {output_dim}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MLPModel(input_dim, output_dim).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5)

    # Convert to tensors.
    X_tensor = torch.FloatTensor(X_train).to(device)
    y_tensor = torch.FloatTensor(y_train).to(device)

    model.train()
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        optimizer.zero_grad()
        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

        current_loss = loss.item()
        scheduler.step(current_loss)

        # Early stopping on the training loss.
        if current_loss < best_loss:
            best_loss = current_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= 15:
            print(f"Early stopping at epoch {epoch}")
            break

        if epoch % 20 == 0:
            print(f"Epoch {epoch}, Loss: {current_loss:.6f}")

    return model

def create_ensemble_prediction(models, X_test, weights=None):
    """Weighted average of predictions from multiple models."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_tensor = torch.FloatTensor(X_test).to(device)

    predictions = []

    for model in models:
        model.eval()
        with torch.no_grad():
            pred = model(X_tensor).cpu().numpy()
        predictions.append(pred)

    # Weighted average (uniform weights by default).
    if weights is None:
        weights = np.ones(len(predictions)) / len(predictions)

    ensemble_pred = np.average(predictions, axis=0, weights=weights)
    return ensemble_pred

def main():
    print("Loading input data...")

    adata_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
    adata_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
    adata_test_mod1 = ad.read_h5ad(par['input_test_mod1'])

    print(f"Train mod1 shape: {adata_train_mod1.shape}")
    print(f"Train mod2 shape: {adata_train_mod2.shape}")
    print(f"Test mod1 shape: {adata_test_mod1.shape}")

    # Preprocess the inputs.
    X_train = preprocess_data(adata_train_mod1)
    X_test = preprocess_data(adata_test_mod1)

    # Target data (mod2) - apply the same loading logic as for mod1.
    if adata_train_mod2.X is None:
        if 'X' in adata_train_mod2.layers:
            y_train = adata_train_mod2.layers['X']
        elif len(adata_train_mod2.layers) > 0:
            layer_name = list(adata_train_mod2.layers.keys())[0]
            y_train = adata_train_mod2.layers[layer_name]
            print(f"Using layer for target: {layer_name}")
        else:
            raise ValueError("No target data found in .X or .layers")
    else:
        y_train = adata_train_mod2.X

    # Convert to dense if sparse.
    if hasattr(y_train, 'toarray'):
        y_train = y_train.toarray()
    else:
        y_train = y_train.copy()

    # Log1p-transform the targets.
    y_train = np.log1p(y_train)

    print(f"Preprocessed X_train shape: {X_train.shape}")
    print(f"Preprocessed y_train shape: {y_train.shape}")
    print(f"Preprocessed X_test shape: {X_test.shape}")

    # Feature selection.
    X_train_selected, feature_selector = select_features(X_train, y_train, n_features=2000)
    X_test_selected = feature_selector.transform(X_test)

    # Dimensionality reduction.
    X_train_reduced, svd_reducer = apply_dimensionality_reduction(X_train_selected, n_components=128)
    X_test_reduced = svd_reducer.transform(X_test_selected)

    # Standardization.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_reduced)
    X_test_scaled = scaler.transform(X_test_reduced)

    print(f"Final feature dimensions: {X_train_scaled.shape[1]}")

    # Train an ensemble of models (inspired by Suzuki's ensemble approach).
    models = []
    n_models = 3  # reduced for faster training

    for i in range(n_models):
        print(f"Training model {i + 1}/{n_models}")

        # Vary the random seed per model for ensemble diversity.
        np.random.seed(42 + i)
        torch.manual_seed(42 + i)

        model = train_model(
            X_train_scaled,
            y_train,
            X_train_scaled.shape[1],
            y_train.shape[1],
            n_epochs=50,  # reduced for faster training
            lr=1e-3,
        )
        models.append(model)

    # Ensemble prediction.
    print("Making ensemble prediction...")
    ensemble_pred = create_ensemble_prediction(models, X_test_scaled)

    # Invert the log1p transform and clip negative values.
    ensemble_pred = np.expm1(ensemble_pred)
    ensemble_pred = np.maximum(ensemble_pred, 0)

    print(f"Prediction shape: {ensemble_pred.shape}")

    # Build the output AnnData object in the expected prediction format.
    adata_pred = ad.AnnData(
        obs=adata_test_mod1.obs.copy(),
        var=adata_train_mod2.var.copy(),
        layers={"normalized": ensemble_pred},
        uns={
            "dataset_id": adata_train_mod1.uns['dataset_id'],
            "method_id": meta["name"],
        },
    )

    # Save prediction.
    adata_pred.write_h5ad(par['output'])
    print(f"Saved prediction to {par['output']}")


if __name__ == "__main__":
    main()
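
As a quick, hypothetical sanity check of the component's output (not part of the commit), the produced file can be inspected along these lines, assuming the default output.h5ad path from the par block above:

# Illustrative check of the prediction file produced by script.py.
import anndata as ad

adata_pred = ad.read_h5ad("output.h5ad")
print(adata_pred)  # obs should come from test_mod1, var from train_mod2
print(adata_pred.layers["normalized"].shape)
print(adata_pred.uns["dataset_id"], adata_pred.uns["method_id"])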
