PIXEL-PIONEERS-TUTORIALS/jax_siamese_object_tracking.py at main · ShaliniAnandaPhD/PIXEL-PIONEERS-TUTORIALS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File name: jax_siamese_object_tracking.py
# File library: JAX, NumPy, Flax, Optax, OpenCV
# Use case: Object Tracking with Siamese Network

import jax
import jax.numpy as jnp
import numpy as np
from flax import linen as nn
from flax.training import train_state
import optax
import cv2

# Define the Siamese network architecture
class SiameseNetwork(nn.Module):
    @nn.compact
    def __call__(self, x):
        x = nn.Conv(features=64, kernel_size=(10, 10), strides=(1, 1))(x)
        x = nn.relu(x)
        x = nn.max_pool(x, window_shape=(2, 2), strides=(2, 2))
        x = nn.Conv(features=128, kernel_size=(7, 7), strides=(1, 1))(x)
        x = nn.relu(x)
        x = nn.max_pool(x, window_shape=(2, 2), strides=(2, 2))
        x = nn.Conv(features=128, kernel_size=(4, 4), strides=(1, 1))(x)
        x = nn.relu(x)
        x = nn.max_pool(x, window_shape=(2, 2), strides=(2, 2))
        x = nn.Conv(features=256, kernel_size=(4, 4), strides=(1, 1))(x)
        x = nn.relu(x)
        return x

# Define the contrastive loss function
def contrastive_loss(embeddings, labels, margin):
    distances = jnp.sqrt(jnp.sum(jnp.square(embeddings[0] - embeddings[1]), axis=1))
    loss = jnp.mean(labels * jnp.square(distances) + (1 - labels) * jnp.square(jnp.maximum(margin - distances, 0)))
    return loss

# Define the training step
@jax.jit
def train_step(state, batch):
    def loss_fn(params):
        embeddings = [SiameseNetwork().apply(params, x) for x in batch[:2]]
        loss = contrastive_loss(embeddings, batch[2], margin=1.0)
        return loss, embeddings

    grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
    (loss, embeddings), grads = grad_fn(state.params)
    state = state.apply_gradients(grads=grads)
    return state, loss

# Load and preprocess the video data
def load_video_data(video_path):
    video = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = video.read()
        if not ret:
            break
        frame = cv2.resize(frame, (256, 256))
        frame = frame.astype(np.float32) / 255.0
        frames.append(frame)
    video.release()
    return jnp.array(frames)

# Generate training pairs from the video frames
def generate_pairs(frames, num_pairs):
    pairs = []
    labels = []
    for _ in range(num_pairs):
        if np.random.rand() < 0.5:
            # Positive pair (same frame, different patches)
            frame_idx = np.random.randint(len(frames))
            patch1 = frames[frame_idx, :128, :128, :]
            patch2 = frames[frame_idx, 128:, 128:, :]
            label = 1
        else:
            # Negative pair (different frames)
            frame_idx1, frame_idx2 = np.random.choice(len(frames), size=2, replace=False)
            patch1 = frames[frame_idx1, :128, :128, :]
            patch2 = frames[frame_idx2, 128:, 128:, :]
            label = 0
        pairs.append((patch1, patch2, label))
    return pairs

# Set hyperparameters
num_epochs = 10
batch_size = 32
learning_rate = 0.001

# Load and preprocess the video data
video_path = "path/to/your/video.mp4"
frames = load_video_data(video_path)

# Generate training pairs
num_pairs = 1000
train_pairs = generate_pairs(frames, num_pairs)

# Create the model and optimizer
model = SiameseNetwork()
params = model.init(jax.random.PRNGKey(0), jnp.zeros((1, 128, 128, 3)))
tx = optax.adam(learning_rate)
state = train_state.TrainState.create(apply_fn=model.apply, params=params, tx=tx)

# Training loop
for epoch in range(num_epochs):
    epoch_loss = 0
    for i in range(0, len(train_pairs), batch_size):
        batch = train_pairs[i:i+batch_size]
        state, loss = train_step(state, batch)
        epoch_loss += loss
    epoch_loss /= (len(train_pairs) // batch_size)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

# Object tracking
def track_object(model, params, template, video_path):
    video = cv2.VideoCapture(video_path)
    template_embedding = model.apply(params, template[jnp.newaxis, ...])

    while True:
        ret, frame = video.read()
        if not ret:
            break

        # Slide the template over the frame and compute similarities
        similarities = []
        for i in range(0, frame.shape[0] - template.shape[0], 4):
            for j in range(0, frame.shape[1] - template.shape[1], 4):
                patch = frame[i:i+template.shape[0], j:j+template.shape[1], :]
                patch_embedding = model.apply(params, patch[jnp.newaxis, ...])
                similarity = jnp.sum(template_embedding * patch_embedding)
                similarities.append(((i, j), similarity))

        # Find the patch with the highest similarity
        best_match = max(similarities, key=lambda x: x[1])[0]

        # Draw bounding box around the tracked object
        x, y = best_match
        cv2.rectangle(frame, (x, y), (x + template.shape[1], y + template.shape[0]), (0, 255, 0), 2)

        cv2.imshow("Object Tracking", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    video.release()
    cv2.destroyAllWindows()

# Select the object template from the first frame
template = frames[0, :128, :128, :]

# Perform object tracking on a new video
tracking_video_path = "path/to/your/tracking_video.mp4"
track_object(model, state.params, template, tracking_video_path)

# Possible errors and solutions:
# 1. OpenCV video file not found:
#    Error: "error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'"
#    Solution: Ensure the video file path is correct and the file exists.

# 2. Shape mismatch errors during training:
#    Error: "ValueError: operands could not be broadcast together with shapes..."
#    Solution: Check the shapes of the inputs and labels to ensure they are compatible. Use appropriate reshaping or padding if necessary.

# 3. Slow training or convergence issues:
#    Solution: Experiment with different learning rates, batch sizes, or network architectures. Use a smaller model or fewer parameters if the training is too slow.