-
Notifications
You must be signed in to change notification settings - Fork 10
Description
Is it feasible to use a ResNet as the student network and CLIP as the teacher network? Their output tensors have different shapes. Is it feasible to reshape the student output so that it matches the teacher output?
My code is as follows:
z = self.encoder_q(img)  # encoder_q is the ResNet network
print("z0的shape=")
print(z.shape)  # [16, 128]; 16 is the batch size
z=z.unsqueeze(2)
print("z1的shape=")
print(z.shape)  # [16, 128, 1]
z=self.decoder(z)  # a single convolutional layer
print("z2的shape=")
print(z.shape)  # [16, 38400, 1]
z=z.view(8, 50, 768)  # reshape is used here
# x_rec = self.decoder(z)
# print("x_rec的shape=")
# print(x_rec.shape)
self.feature_model.eval()
with torch.no_grad():
x = normalize_clip(unnormalize(im1))
print("x0的shape=")
print(x.shape)
# x = self.resize_func(x)
# print("x0的shape=")
# print(x.shape)
x_tgt = self.feature_model.encode_image_featuremap(x)
print("x1的shape=")
print(x_tgt.shape)
x_tgt = self.feature_model.visual.ln_post(x_tgt)
x_tgt = x_tgt.detach()
x_tgt = self.ln_tgt(x_tgt)
print("x2的shape=")
print(x_tgt.shape)  # [16, 50, 768]
loss_FD = self.loss_feat(z, x_tgt)
loss1 = loss_FD.mean()