# Joint training logic
"""
Main training script for the Neural Video Compression framework.
This script performs the Joint Fine-Tuning (Stage 3) of the entire pipeline:
- Motion VAE (Pre-trained on Vimeo-Triplet/HardMode) -> Unfrozen
- Residual VAE (Pre-trained on Vimeo-Septuplet) -> Unfrozen
- Adaptive Fusion Module -> Learned from scratch
It requires pre-trained weights in the 'weights/' directory to initialize the backbone.
"""
import os

import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Import local modules from src/
from src.models import ScaleHyperprior, MotionRefineNET, ResRefiNET, AdaptiveRefiNET
from src.dataset import VimeoSeptupletDataset, VimeoTripletSkipDataset, VimeoHardModeDataset
from src.loss import RateDistortionLoss
from src.utils import robust_load, compute_adaptive_mask, flow_warp
from configs.config import Config
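# Config is expected to expose the attributes referenced below:
#   CHECKPOINT_DIR, PRETRAINED_WEIGHTS (dict with keys 'motion', 'refine',
#   'residual', 'adaptive', 'post'), LEARNING_RATE, LAMBDA_RD, DATASET_MODE,
#   DATASET_DIR, CROP_SIZE, HARD_MODE_GAP, BATCH_SIZE, NUM_WORKERS, EPOCHS.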
# Try to import RAFT (optional for the showcase, but good to have)
try:
    from torchvision.models.optical_flow import raft_small
    RAFT_AVAILABLE = True
except ImportError:
    RAFT_AVAILABLE = False
    print("RAFT not found. Using dummy flow for demonstration.")


def train_joint():
    # 1. SETUP DEVICE & DIRS
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Starting Joint Fine-Tuning on {device}")
    os.makedirs(Config.CHECKPOINT_DIR, exist_ok=True)

    # 2. INITIALIZE ARCHITECTURE (The "Pentagon")
    print("Building Model Architecture...")

    # --- MOTION BRANCH (Larger Capacity: N=192) ---
    # Based on pre-trained weights 'FlowVAE_finetune_ep11.pth'
    print("Init Motion Model (N=192, M=192)")
    motion_model = ScaleHyperprior(N=192, M=192, in_channels=2, out_channels=2).to(device)
    # Refines the optical flow (2 channels) using history (12 channels)
    refine_model = MotionRefineNET().to(device)

    # --- RESIDUAL BRANCH (Standard Capacity: N=128) ---
    # Based on pre-trained weights 'ResidualVAE_HardMode_Ep4.pth'
    print("Init Residual Model (N=128, M=128)")
    residual_model = ScaleHyperprior(N=128, M=128, in_channels=3, out_channels=3).to(device)

    # --- RECONSTRUCTION BRANCH ---
    adaptive_model = AdaptiveRefiNET().to(device)
    post_model = ResRefiNET().to(device)

    # Optical Flow Backbone (RAFT Small - Frozen)
    if RAFT_AVAILABLE:
        raft = raft_small(weights="DEFAULT").to(device).eval()
        for p in raft.parameters():
            p.requires_grad = False
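    # NOTE: torchvision's RAFT was trained on frames normalized to [-1, 1];
    # this assumes the dataset (or an upstream transform) provides frames in
    # the range the backbone expects.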
    # 3. LOAD PRE-TRAINED WEIGHTS (Stage-wise Initialization)
    print("Loading Pre-trained Weights...")
    robust_load(motion_model, Config.PRETRAINED_WEIGHTS['motion'])
    robust_load(refine_model, Config.PRETRAINED_WEIGHTS['refine'])
    robust_load(residual_model, Config.PRETRAINED_WEIGHTS['residual'])

    # Adaptive & Post might be new or pre-trained
    if os.path.exists(Config.PRETRAINED_WEIGHTS['adaptive']):
        robust_load(adaptive_model, Config.PRETRAINED_WEIGHTS['adaptive'])
    else:
        print("Adaptive weights not found (starting from scratch)")
    if os.path.exists(Config.PRETRAINED_WEIGHTS['post']):
        robust_load(post_model, Config.PRETRAINED_WEIGHTS['post'])
    else:
        print("Post-process weights not found (starting from scratch)")
    # 4. OPTIMIZER SETUP (Split Net vs Aux)
    # Critical: CompressAI models have "auxiliary" parameters (quantiles)
    # that must be optimized separately to estimate entropy correctly.
    net_params = []
    aux_params = []
    # Collect parameters from all trainable modules
    model_list = [motion_model, refine_model, residual_model, adaptive_model, post_model]
    for model in model_list:
        for n, p in model.named_parameters():
            if p.requires_grad:
                if n.endswith(".quantiles"):
                    aux_params.append(p)
                else:
                    net_params.append(p)

    # Main Optimizer (Weights)
    optimizer = optim.AdamW(net_params, lr=Config.LEARNING_RATE)
    # Aux Optimizer (Entropy Bottleneck parameters) - usually requires a higher LR (1e-3)
    aux_optimizer = optim.Adam(aux_params, lr=1e-3)
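    # (In CompressAI, these ".quantiles" belong to the EntropyBottleneck; they
    # are fit to the latent CDF via aux_loss() in step I below rather than by
    # the rate-distortion objective.)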
    criterion = RateDistortionLoss(lambda_val=Config.LAMBDA_RD)

    # 5. DATASET SELECTION
    print(f"Loading Dataset... [Mode: {Config.DATASET_MODE}]")
    if Config.DATASET_MODE == "septuplet":
        dataset = VimeoSeptupletDataset(Config.DATASET_DIR, split='train', crop_size=Config.CROP_SIZE)
    elif Config.DATASET_MODE == "triplet":
        dataset = VimeoTripletSkipDataset(Config.DATASET_DIR, split='train', crop_size=Config.CROP_SIZE)
    elif Config.DATASET_MODE == "hard_mode":
        dataset = VimeoHardModeDataset(Config.DATASET_DIR, gap=Config.HARD_MODE_GAP, split='train', crop_size=Config.CROP_SIZE)
    else:
        raise ValueError(f"Unknown dataset mode: {Config.DATASET_MODE}")
    dataloader = DataLoader(dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=Config.NUM_WORKERS)
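    # Each dataset is assumed to yield a (frame1, frame2) pair of float image
    # tensors cropped to Config.CROP_SIZE, as unpacked in the loop below.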
    # 6. TRAINING LOOP
    print(f"Starting Training Loop for {Config.EPOCHS} Epochs...")
    # Set models to train mode
    for model in model_list:
        model.train()

    for epoch in range(Config.EPOCHS):
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{Config.EPOCHS}")
        for batch_idx, (frame1, frame2) in enumerate(loop):
            frame1, frame2 = frame1.to(device), frame2.to(device)
            optimizer.zero_grad()
            aux_optimizer.zero_grad()

            # --- A. OPTICAL FLOW ESTIMATION ---
            if RAFT_AVAILABLE:
                with torch.no_grad():
                    # RAFT estimation (im1 -> im2); the last element of the
                    # returned list is the most refined flow estimate
                    flow_pred = raft(frame1, frame2)[-1]
            else:
                # Dummy flow
                flow_pred = torch.zeros(frame1.shape[0], 2, frame1.shape[2], frame1.shape[3], device=device)

            # --- B. MOTION COMPRESSION ---
            motion_out = motion_model(flow_pred)
            flow_hat = motion_out["x_hat"]
            motion_likelihoods = motion_out["likelihoods"]

            # --- C. REFINEMENT (Corrected Logic) ---
            # We refine the FLOW, not the image.
            # Input: Flow (2) + History (12) = 14 Channels.
            # This matches the pretrained weights of RefineNET.
            history_dummy = torch.zeros(frame1.shape[0], 12, frame1.shape[2], frame1.shape[3], device=device)
            flow_refined = refine_model(flow_hat, history_dummy)

            # --- D. WARPING (Using Refined Flow) ---
            # Warp Frame1 using the high-quality refined flow
            frame2_pred = flow_warp(frame1, flow_refined)
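            # (flow_warp is assumed to backward-warp: sample frame1 at grid
            # positions displaced by flow_refined, e.g. via F.grid_sample.)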
            # --- E. RESIDUAL CODING ---
            # Calculate residual against the refined prediction
            residual = frame2 - frame2_pred
            res_out = residual_model(residual)
            res_hat = res_out["x_hat"]
            res_likelihoods = res_out["likelihoods"]
            recon_residual = res_hat

            # --- F. ADAPTIVE FUSION ---
            mask = compute_adaptive_mask(recon_residual)
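            # (compute_adaptive_mask presumably derives a spatial weighting
            # from the reconstructed residual, telling the fusion module where
            # the warped prediction can be trusted.)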
            # Fusion uses the reconstructed residual and the warped prediction
            frame_recon = adaptive_model(recon_residual, frame2_pred, mask, history_dummy)

            # --- G. POST-PROCESSING ---
            final_image = post_model(frame_recon)
            # --- H. MAIN LOSS & BACKPROP ---
            # NOTE: if both VAEs use the same likelihood keys (e.g. "y"/"z" as
            # in CompressAI), a plain {**a, **b} merge would silently drop the
            # motion bits, so the keys are prefixed before merging.
            combined_likelihoods = {f"motion_{k}": v for k, v in motion_likelihoods.items()}
            combined_likelihoods.update({f"res_{k}": v for k, v in res_likelihoods.items()})
            output_dict = {
                "x_hat": final_image,
                "likelihoods": combined_likelihoods
            }
            loss, dist, bpp = criterion(output_dict, frame2)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net_params, 1.0)
            optimizer.step()

            # --- I. AUXILIARY LOSS OPTIMIZATION (Crucial Step) ---
            # Update the entropy bottleneck parameters (CDF tables)
            aux_loss = motion_model.aux_loss() + residual_model.aux_loss()
            aux_loss.backward()
            aux_optimizer.step()

            # Update Progress Bar
            loop.set_postfix(loss=loss.item(), bpp=bpp.item(), dist=dist.item(), aux=aux_loss.item())
        # Save Checkpoint (once per epoch; refine_model is trained too, so its
        # weights are checkpointed alongside the others)
        save_path = os.path.join(Config.CHECKPOINT_DIR, f"joint_model_ep{epoch+1}.pth")
        torch.save({
            'epoch': epoch,
            'motion_state': motion_model.state_dict(),
            'refine_state': refine_model.state_dict(),
            'residual_state': residual_model.state_dict(),
            'adaptive_state': adaptive_model.state_dict(),
            'post_state': post_model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'aux_optimizer': aux_optimizer.state_dict()
        }, save_path)
        print(f"Checkpoint saved: {save_path}")


if __name__ == "__main__":
    train_joint()