Skip to content

Commit 9a1d672

Browse files
add novel (#2)
* add novel Co-authored-by: Kai Waldrant <[email protected]> * fix config * update novel --------- Co-authored-by: Kai Waldrant <[email protected]>
1 parent 39f5cec commit 9a1d672

File tree

10 files changed

+623
-2
lines changed

10 files changed

+623
-2
lines changed

scripts/create_datasets/test_resources.sh

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,31 @@ nextflow run . \
3030
echo "Run one method"

# For every test dataset: run the KNN baseline, then pre-train the
# model-based methods so their model directories exist as test resources.
for name in bmmc_cite/normal bmmc_cite/swap bmmc_multiome/normal bmmc_multiome/swap; do
  # All paths are quoted so an OUTPUT_DIR containing spaces or glob
  # characters is passed through as a single argument.
  dataset_dir="$OUTPUT_DIR/openproblems_neurips2021/$name"

  echo "Run KNN on $name"
  viash run src/methods/knnr_py/config.vsh.yaml -- \
    --input_train_mod1 "$dataset_dir/train_mod1.h5ad" \
    --input_train_mod2 "$dataset_dir/train_mod2.h5ad" \
    --input_test_mod1 "$dataset_dir/test_mod1.h5ad" \
    --output "$dataset_dir/prediction.h5ad"

  echo "pre-train simple_mlp on $name"
  # rm -rf is a no-op when the directory does not exist yet, so no
  # separate existence check is needed before recreating it.
  rm -rf "$dataset_dir/models/simple_mlp/"
  mkdir -p "$dataset_dir/models/simple_mlp/"
  viash run src/methods/simple_mlp/train/config.vsh.yaml -- \
    --input_train_mod1 "$dataset_dir/train_mod1.h5ad" \
    --input_train_mod2 "$dataset_dir/train_mod2.h5ad" \
    --input_test_mod1 "$dataset_dir/test_mod1.h5ad" \
    --output "$dataset_dir/models/simple_mlp/"

  echo "pre-train novel on $name"
  rm -rf "$dataset_dir/models/novel/"
  mkdir -p "$dataset_dir/models/novel/"
  viash run src/methods/novel/train/config.vsh.yaml -- \
    --input_train_mod1 "$dataset_dir/train_mod1.h5ad" \
    --input_train_mod2 "$dataset_dir/train_mod2.h5ad" \
    --input_test_mod1 "$dataset_dir/test_mod1.h5ad" \
    --output "$dataset_dir/models/novel"

done
4859

4960
# only run this if you have access to the openproblems-data bucket
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
import torch
2+
3+
from torch import nn
4+
import torch.nn.functional as F
5+
6+
from torch.utils.data import Dataset
7+
8+
from typing import Optional
9+
10+
import anndata
11+
import numpy as np
12+
import pandas as pd
13+
import scipy.sparse
14+
import sklearn.decomposition
15+
import sklearn.feature_extraction.text
16+
import sklearn.preprocessing
17+
import sklearn.neighbors
18+
import sklearn.utils.extmath
19+
20+
class tfidfTransformer():
    """TF-IDF weighting for a (rows x features) count matrix.

    ``fit`` learns one inverse-document-frequency weight per column as
    ``n_rows / column_sum``. ``transform`` divides every row by its row sum
    (term frequency) and scales each column by the learned weight.

    Rows or columns that sum to zero contribute zeros instead of NaN/inf, so
    dense and sparse inputs produce the same values.
    """

    def __init__(self):
        self.idf = None
        self.fitted = False

    def fit(self, X):
        """Learn the per-column IDF weights from ``X``.

        Args:
            X: dense array or scipy sparse matrix of shape (n_rows, n_features).

        Returns:
            The fitted transformer itself.
        """
        # Sparse ``sum`` returns an ``np.matrix``; flatten to a plain 1-D array
        # so later ``*`` is always element-wise, never a matrix product.
        col_sums = np.asarray(X.sum(axis=0)).ravel()
        unseen = col_sums == 0
        safe_col_sums = np.where(unseen, 1, col_sums)
        # Columns never observed during fitting get weight 0 rather than inf.
        self.idf = np.where(unseen, 0.0, X.shape[0] / safe_col_sums)
        self.fitted = True
        return self

    def transform(self, X):
        """Apply TF-IDF weighting to ``X`` using the fitted IDF weights.

        Args:
            X: dense array or scipy sparse matrix with the same number of
                columns as the data passed to ``fit``.

        Returns:
            A scipy sparse matrix when ``X`` is sparse, otherwise an ndarray.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if not self.fitted:
            raise RuntimeError('Transformer was not fitted on any data')
        row_sums = np.asarray(X.sum(axis=1)).reshape(-1, 1)
        empty = row_sums == 0
        # Substitute 1 for empty rows so the division below is always defined.
        safe_row_sums = np.where(empty, 1, row_sums)
        if scipy.sparse.issparse(X):
            inv_row_sums = np.where(empty, 0.0, 1 / safe_row_sums)
            tf = X.multiply(inv_row_sums)
            return tf.multiply(self.idf)
        tf = np.asarray(X) / safe_row_sums
        return tf * self.idf

    def fit_transform(self, X):
        """Fit on ``X`` and return its TF-IDF transformed version."""
        self.fit(X)
        return self.transform(X)
42+
43+
class lsiTransformer():
    """LSI embedding pipeline for AnnData objects.

    Steps applied to ``adata.X``: TF-IDF weighting, L1 row normalisation,
    ``log1p(x * 1e4)``, then a truncated SVD with ``n_components`` components
    (fixed ``random_state=777``). ``transform`` additionally centres and
    scales every row of the embedding and returns it as a DataFrame indexed
    by ``obs_names``.
    """

    def __init__(self,
                 n_components: int = 20,
                 use_highly_variable = None
                 ):
        self.n_components = n_components
        # None means "decide during fit, based on whether adata.var has 'hvg'".
        self.use_highly_variable = use_highly_variable
        self.tfidfTransformer = tfidfTransformer()
        self.normalizer = sklearn.preprocessing.Normalizer(norm="l1")
        self.pcaTransformer = sklearn.decomposition.TruncatedSVD(
            n_components=self.n_components,
            random_state=777,
        )
        self.fitted = None

    def _select_features(self, adata):
        """Return the ``hvg`` column subset when enabled, else ``adata`` itself."""
        if self.use_highly_variable:
            return adata[:, adata.var["hvg"]]
        return adata

    def fit(self, adata: anndata.AnnData):
        """Fit the TF-IDF weights, the normaliser and the SVD on ``adata``."""
        if self.use_highly_variable is None:
            self.use_highly_variable = "hvg" in adata.var
        subset = self._select_features(adata)
        weighted = self.tfidfTransformer.fit_transform(subset.X)
        scaled = np.log1p(self.normalizer.fit_transform(weighted) * 1e4)
        # Only the fitted state is needed here; the embedding is discarded.
        self.pcaTransformer.fit_transform(scaled)
        self.fitted = True

    def transform(self, adata):
        """Embed ``adata`` with the fitted pipeline.

        Returns:
            pandas.DataFrame with one row per observation, indexed by
            ``obs_names``, holding the row-standardised embedding.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if not self.fitted:
            raise RuntimeError('Transformer was not fitted on any data')
        subset = self._select_features(adata)
        weighted = self.tfidfTransformer.transform(subset.X)
        scaled = np.log1p(self.normalizer.transform(weighted) * 1e4)
        embedding = self.pcaTransformer.transform(scaled)
        # Standardise each row across its components (sample std, ddof=1).
        embedding -= embedding.mean(axis=1, keepdims=True)
        embedding /= embedding.std(axis=1, ddof=1, keepdims=True)
        return pd.DataFrame(embedding, index = subset.obs_names)

    def fit_transform(self, adata):
        """Fit on ``adata`` and return its embedding."""
        self.fit(adata)
        return self.transform(adata)
85+
86+
class ModalityMatchingDataset(Dataset):
    """Dataset serving rows of one or two positionally aligned DataFrames.

    With ``is_train`` equal to ``True`` each item is an ``(x, y)`` pair taken
    from the same row position of ``df_modality1`` and ``df_modality2``;
    otherwise only ``x`` is returned and ``df_modality2`` is never read.
    """

    def __init__(
        self, df_modality1, df_modality2, is_train=True
    ):
        super().__init__()
        self.df_modality1 = df_modality1
        self.df_modality2 = df_modality2
        self.is_train = is_train

    def __len__(self):
        """Number of rows in the first modality."""
        n_rows = self.df_modality1.shape[0]
        return n_rows

    def __getitem__(self, index: int):
        """Return the row values at position ``index`` as numpy arrays."""
        features = self.df_modality1.iloc[index].values
        if not (self.is_train == True):  # noqa: E712 - equality kept on purpose
            return features
        target = self.df_modality2.iloc[index].values
        return features, target
105+
106+
class Swish(torch.autograd.Function):
    """Swish activation ``x * sigmoid(x)`` with an explicit backward pass.

    Only the input tensor is saved for backward; the sigmoid is recomputed
    there instead of being stored.
    """

    @staticmethod
    def forward(ctx, i):
        """Return ``i * sigmoid(i)`` and remember ``i`` for the backward pass."""
        ctx.save_for_backward(i)
        return i * torch.sigmoid(i)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        d/dx [x * s(x)] = s(x) * (1 + x * (1 - s(x))), with s the sigmoid.
        """
        # ``saved_tensors`` is the supported accessor; ``saved_variables`` is
        # a deprecated alias.
        (i,) = ctx.saved_tensors
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
117+
118+
class Swish_module(nn.Module):
    """``nn.Module`` wrapper that applies the custom ``Swish`` function."""

    def forward(self, x):
        """Apply Swish element-wise to ``x``."""
        activated = Swish.apply(x)
        return activated
121+
122+
# Module-level sigmoid instance, created once when the module is imported.
sigmoid = torch.nn.Sigmoid()
123+
124+
class ModelRegressionGex2Atac(nn.Module):
    """MLP regressor: dim_mod1 -> 1024 -> 256 -> 2048 -> dim_mod2.

    Every linear layer, including the output layer, is followed by GELU;
    dropout is applied after each of the three hidden activations.
    """

    def __init__(self, dim_mod1, dim_mod2):
        super().__init__()
        # Registration order is kept as-is: it fixes both the state_dict key
        # order and the order in which seeded weight initialisation happens.
        self.input_ = nn.Linear(in_features=dim_mod1, out_features=1024)
        self.fc = nn.Linear(in_features=1024, out_features=256)
        self.fc1 = nn.Linear(in_features=256, out_features=2048)
        self.dropout1 = nn.Dropout(p=0.298885630228993)
        self.dropout2 = nn.Dropout(p=0.11289717442776658)
        self.dropout3 = nn.Dropout(p=0.13523634924414762)
        self.output = nn.Linear(in_features=2048, out_features=dim_mod2)

    def forward(self, x):
        """Map a batch of modality-1 features to modality-2 predictions."""
        hidden = self.dropout1(F.gelu(self.input_(x)))
        hidden = self.dropout2(F.gelu(self.fc(hidden)))
        hidden = self.dropout3(F.gelu(self.fc1(hidden)))
        return F.gelu(self.output(hidden))
144+
145+
class ModelRegressionAtac2Gex(nn.Module):
    """MLP regressor: dim_mod1 -> 2048 -> 2048 -> 512 -> dim_mod2.

    Every linear layer, including the output layer, is followed by GELU;
    dropout is applied after each of the three hidden activations.
    """

    def __init__(self, dim_mod1, dim_mod2):
        super().__init__()
        # Registration order is kept as-is: it fixes both the state_dict key
        # order and the order in which seeded weight initialisation happens.
        self.input_ = nn.Linear(in_features=dim_mod1, out_features=2048)
        self.fc = nn.Linear(in_features=2048, out_features=2048)
        self.fc1 = nn.Linear(in_features=2048, out_features=512)
        self.dropout1 = nn.Dropout(p=0.2649138776004753)
        self.dropout2 = nn.Dropout(p=0.1769628308148758)
        self.dropout3 = nn.Dropout(p=0.2516791883012817)
        self.output = nn.Linear(in_features=512, out_features=dim_mod2)

    def forward(self, x):
        """Map a batch of modality-1 features to modality-2 predictions."""
        hidden = self.dropout1(F.gelu(self.input_(x)))
        hidden = self.dropout2(F.gelu(self.fc(hidden)))
        hidden = self.dropout3(F.gelu(self.fc1(hidden)))
        return F.gelu(self.output(hidden))
164+
165+
class ModelRegressionAdt2Gex(nn.Module):
    """MLP regressor: dim_mod1 -> 512 -> 512 -> 512 -> 512 -> dim_mod2.

    Every linear layer, including the output layer, is followed by GELU.
    ``dropout1`` and ``swish`` are registered on the module but are not used
    by ``forward``.
    """

    def __init__(self, dim_mod1, dim_mod2):
        super().__init__()
        # Registration order is kept as-is: it fixes both the state_dict key
        # order and the order in which seeded weight initialisation happens.
        self.input_ = nn.Linear(in_features=dim_mod1, out_features=512)
        self.dropout1 = nn.Dropout(p=0.0)
        self.swish = Swish_module()
        self.fc = nn.Linear(in_features=512, out_features=512)
        self.fc1 = nn.Linear(in_features=512, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=512)
        self.output = nn.Linear(in_features=512, out_features=dim_mod2)

    def forward(self, x):
        """Map a batch of modality-1 features to modality-2 predictions."""
        hidden = x
        for layer in (self.input_, self.fc, self.fc1, self.fc2, self.output):
            hidden = F.gelu(layer(hidden))
        return hidden
182+
183+
class ModelRegressionGex2Adt(nn.Module):
    """MLP regressor: dim_mod1 -> 512 -> 512 -> 2048 -> dim_mod2.

    Every linear layer, including the output layer, is followed by GELU;
    dropout is applied after each of the three hidden activations.
    """

    def __init__(self, dim_mod1, dim_mod2):
        super().__init__()
        # Registration order is kept as-is: it fixes both the state_dict key
        # order and the order in which seeded weight initialisation happens.
        self.input_ = nn.Linear(in_features=dim_mod1, out_features=512)
        self.dropout1 = nn.Dropout(p=0.20335661386636347)
        self.dropout2 = nn.Dropout(p=0.15395289261127876)
        self.dropout3 = nn.Dropout(p=0.16902655078832815)
        self.fc = nn.Linear(in_features=512, out_features=512)
        self.fc1 = nn.Linear(in_features=512, out_features=2048)
        self.output = nn.Linear(in_features=2048, out_features=dim_mod2)

    def forward(self, x):
        """Map a batch of modality-1 features to modality-2 predictions."""
        hidden = self.dropout1(F.gelu(self.input_(x)))
        hidden = self.dropout2(F.gelu(self.fc(hidden)))
        hidden = self.dropout3(F.gelu(self.fc1(hidden)))
        return F.gelu(self.output(hidden))
203+
204+
def rmse(y, y_pred):
    """Return the root-mean-square error between ``y`` and ``y_pred``.

    The mean is taken over all elements of the element-wise difference.
    """
    residual = y - y_pred
    mean_squared_error = np.mean(np.square(residual))
    return np.sqrt(mean_squared_error)
206+
207+
def train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, name_model, device, n_epochs=100):
    """Train ``model`` and checkpoint the weights with the best validation RMSE.

    Each epoch runs one optimisation pass over ``dataloader_train`` using
    ``sqrt(loss_fn(prediction, target))`` as the training loss, followed by
    a single evaluation pass over ``dataloader_test``. Validation
    predictions are clipped at zero before scoring with ``rmse``. Whenever
    the score improves, ``model.state_dict()`` is saved to ``name_model``.

    Args:
        model: torch module to optimise.
        optimizer: optimiser already bound to ``model``'s parameters.
        loss_fn: callable ``(prediction, target) -> scalar tensor``.
        dataloader_train: iterable of ``(x, y)`` training batches.
        dataloader_test: iterable of ``(x, y)`` validation batches.
        name_model: destination passed to ``torch.save`` for the checkpoint.
        device: device the batches are moved to before the forward pass.
        n_epochs: number of training epochs (default 100).

    Returns:
        The best validation RMSE observed (``inf`` if no epoch produced a
        comparable score, in which case nothing was saved).
    """
    best_score = float("inf")
    for _ in range(n_epochs):
        model.train()
        for x, y in dataloader_train:
            optimizer.zero_grad()
            output = model(x.float().to(device))
            loss = torch.sqrt(loss_fn(output, y.float().to(device)))
            loss.backward()
            optimizer.step()

        # One validation pass per epoch: collect predictions and targets on
        # the CPU so the score is computed over the whole validation set.
        outputs = []
        targets = []
        model.eval()
        with torch.no_grad():
            for x, y in dataloader_test:
                output = model(x.float().to(device))
                outputs.append(output.detach().cpu().numpy())
                targets.append(y.float().detach().cpu().numpy())
        cat_outputs = np.concatenate(outputs)
        cat_targets = np.concatenate(targets)
        # Negative predictions are clipped to zero before scoring.
        cat_outputs[cat_outputs < 0.0] = 0

        score = rmse(cat_targets, cat_outputs)
        if score < best_score:
            torch.save(model.state_dict(), name_model)
            best_score = score
    print("best rmse: ", best_score)
    return best_score
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Component configuration for `novel_predict`.
# NOTE(review): nesting was reconstructed from a flattened diff view;
# confirm the indentation against the repository copy.
# Base definition that this file is merged with.
__merge__: ../../../api/comp_method_predict.yaml
name: novel_predict

info:
  test_setup:
    with_model:
      # Model directory supplied to the component during testing; presumably
      # the models/novel output of the train step in test_resources.sh.
      input_model: resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/models/novel

resources:
  - type: python_script
    path: script.py
  # Helper module one directory up; presumably shared with the sibling
  # train component - verify.
  - path: ../helper_functions.py
engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    setup:
      # Extra Python packages installed on top of the base image.
      - type: python
        packages:
          - scikit-learn
          - networkx
runners:
  - type: executable
  - type: nextflow
    directives:
      label: [highmem, hightime, midcpu, highsharedmem, gpu]
26+

0 commit comments

Comments
 (0)