opencv_demo.py
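"""Real-time sign language letter recognition demo.

Reads webcam frames with OpenCV, optionally localizes the hand with
MediaPipe Hands, classifies the cropped region with a Keras CNN, and
smooths the prediction with a short majority-vote window.
"""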
import collections
import os
import sys
import time
from typing import List

import cv2
import numpy as np

try:
    import tensorflow as tf
except ImportError:
    print('[ERROR] TensorFlow not installed. Install: pip install tensorflow')
    sys.exit(1)

try:
    import mediapipe as mp
    MP_AVAILABLE = True
except ImportError:
    MP_AVAILABLE = False

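# MediaPipe is optional: when it is missing, main() falls back to a fixed
# center-crop ROI instead of detecting the hand.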
MODEL_PATH = 'cnn_sign_language_model.h5'
SMOOTH_WINDOW = 3          # small voting window so the label adapts quickly to a new gesture
MIN_CONFIDENCE = 0.5
TEXT_FG_COLOR = (0, 255, 255)
TEXT_BG_COLOR = (0, 0, 0)
FPS_FG_COLOR = (255, 255, 0)
DEFAULT_MIRROR = True
DETECTION_INTERVAL = 2     # run hand detection every N frames
INFERENCE_INTERVAL = 2     # run CNN inference every N frames
DETECTION_DOWNSCALE = 1.0  # >1.0 detects on a downscaled frame and rescales the bbox
# Extra padding around the detected hand so fingers are not cropped
# (increase if the model misclassifies near the crop edges).
HAND_PADDING = 0.35        # fraction of the bbox size added on each side
CAPTURE_WIDTH = 640
CAPTURE_HEIGHT = 480
ENABLE_GPU_MEMORY_GROWTH = True

# The 24 static fingerspelling letters; J and Z are omitted because they require motion.
DEFAULT_LABELS_24 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
                     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y']

def load_labels(num_classes: int) -> List[str]:
    """Return letter labels for the 24-class model, else numeric index strings."""
    if num_classes == 24:
        return DEFAULT_LABELS_24
    return [str(i) for i in range(num_classes)]

def majority_vote(buf: collections.deque) -> int:
    """Return the most common class index in the buffer, or -1 if it is empty."""
    if not buf:
        return -1
    return collections.Counter(buf).most_common(1)[0][0]

def preprocess_roi(bgr: np.ndarray, size: int, channels: int, grayscale: bool) -> np.ndarray:
    """Resize a BGR crop to size x size and scale pixel values to [0, 1]."""
    if grayscale:
        g = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        g = cv2.resize(g, (size, size), interpolation=cv2.INTER_AREA)
        if channels == 1:
            arr = g[..., None]  # add a channel axis: (size, size, 1)
        else:
            arr = cv2.cvtColor(g, cv2.COLOR_GRAY2BGR)
    else:
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        arr = cv2.resize(rgb, (size, size), interpolation=cv2.INTER_AREA)
    return arr.astype('float32') / 255.0

def detect_hand_bbox(frame_rgb, hands, w, h, padding=0.15):
    """Return a padded (x1, y1, x2, y2) pixel bbox for the first detected hand, or None."""
    result = hands.process(frame_rgb)
    if not result.multi_hand_landmarks:
        return None
    lm = result.multi_hand_landmarks[0]
    # MediaPipe landmarks are normalized to [0, 1]; take their extent and pad it.
    xs = [p.x for p in lm.landmark]
    ys = [p.y for p in lm.landmark]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)
    dx = (max_x - min_x) * padding
    dy = (max_y - min_y) * padding
    min_x = max(0.0, min_x - dx); max_x = min(1.0, max_x + dx)
    min_y = max(0.0, min_y - dy); max_y = min(1.0, max_y + dy)
    return int(min_x * w), int(min_y * h), int(max_x * w), int(max_y * h)

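# Runtime keys: q/Esc quit, f toggles mirroring, g toggles the debug overlay,
# r resets the fallback ROI (only without MediaPipe), d reports the detection interval.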
def main():
    if not os.path.exists(MODEL_PATH):
        print(f'[ERROR] Model file not found: {MODEL_PATH}')
        return
    if ENABLE_GPU_MEMORY_GROWTH:
        try:
            gpus = tf.config.list_physical_devices('GPU')
            for g in gpus:
                tf.config.experimental.set_memory_growth(g, True)
            if gpus:
                print(f'[INFO] Enabled GPU memory growth for {len(gpus)} GPU(s)')
        except Exception as e:
            print(f'[WARN] GPU memory growth not set: {e}')
    print(f'[INFO] Loading model {MODEL_PATH}')
    model = tf.keras.models.load_model(MODEL_PATH)
    in_shape = model.inputs[0].shape
    if len(in_shape) != 4:
        print('[ERROR] Unexpected input shape:', in_shape)
        return
    # Derive the input geometry (batch, height, width, channels) from the model.
    _, H, W, C = in_shape
    target_size = int(min(H, W))
    grayscale = (int(C) == 1)
    num_classes = int(model.outputs[0].shape[-1])
    labels = load_labels(num_classes)
    print(f'[INFO] Classes: {num_classes} -> {labels}')
    if MP_AVAILABLE:
        mp_hands = mp.solutions.hands
        hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1,
                               min_detection_confidence=0.5, min_tracking_confidence=0.5)
        use_mediapipe = True
        print('[INFO] MediaPipe enabled')
    else:
        hands = None
        use_mediapipe = False
        print('[INFO] MediaPipe not installed; using center ROI')
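    # Open the default webcam. The requested capture size is only a hint;
    # the driver may substitute the nearest supported resolution.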
    cap = cv2.VideoCapture(0)
    if CAPTURE_WIDTH > 0 and CAPTURE_HEIGHT > 0:
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, CAPTURE_WIDTH)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, CAPTURE_HEIGHT)
    if not cap.isOpened():
        print('[ERROR] Cannot access webcam')
        return
    fallback_roi = None
    # Debug features (toggled at runtime with 'g')
    debug = False
    last_debug_print = 0
    TOPK = 5
    mirror = DEFAULT_MIRROR
    pred_buf = collections.deque(maxlen=SMOOTH_WINDOW)
    last_label = ''
    last_conf = 0.0
    last_pred_idx = -1
    last_probs = None
    last_bbox = None
    last_detect_frame = -999  # sentinels so detection and inference run on the first frame
    last_infer_frame = -999
    fps_t = time.time()
    frame_i = 0
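    # Main loop: detection and inference run every DETECTION_INTERVAL and
    # INFERENCE_INTERVAL frames respectively; the latest bbox and probabilities
    # are reused on the frames in between.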
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                print('[WARN] Frame read failed')
                break
            frame_i += 1
            if mirror:
                frame = cv2.flip(frame, 1)
            h0, w0 = frame.shape[:2]
            if use_mediapipe:
                if frame_i - last_detect_frame >= DETECTION_INTERVAL:
                    if DETECTION_DOWNSCALE > 1.0:
                        # Detect on a smaller frame, then scale the bbox back up.
                        ds_w = int(w0 / DETECTION_DOWNSCALE)
                        ds_h = int(h0 / DETECTION_DOWNSCALE)
                        det_small = cv2.resize(frame, (ds_w, ds_h), interpolation=cv2.INTER_LINEAR)
                        rgb = cv2.cvtColor(det_small, cv2.COLOR_BGR2RGB)
                        bbox_small = detect_hand_bbox(rgb, hands, ds_w, ds_h, padding=HAND_PADDING)
                        if bbox_small:
                            x1s, y1s, x2s, y2s = bbox_small
                            scale = DETECTION_DOWNSCALE
                            last_bbox = (int(x1s * scale), int(y1s * scale),
                                         int(x2s * scale), int(y2s * scale))
                        else:
                            last_bbox = None
                    else:
                        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        last_bbox = detect_hand_bbox(rgb, hands, w0, h0, padding=HAND_PADDING)
                    last_detect_frame = frame_i
                bbox = last_bbox
            else:
                # Without MediaPipe, use a fixed square ROI in the frame center.
                if fallback_roi is None:
                    side = int(min(w0, h0) * 0.6)
                    x1 = (w0 - side) // 2
                    y1 = (h0 - side) // 2
                    fallback_roi = (x1, y1, x1 + side, y1 + side)
                bbox = fallback_roi
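            # Classify the crop inside the (clamped) bbox; between inference
            # frames the previous prediction is reused so the overlay stays stable.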
            if bbox:
                x1, y1, x2, y2 = bbox
                # Clamp to frame bounds before cropping.
                x1 = max(0, x1); y1 = max(0, y1); x2 = min(w0, x2); y2 = min(h0, y2)
                if x2 > x1 and y2 > y1:
                    if frame_i - last_infer_frame >= INFERENCE_INTERVAL:
                        roi = frame[y1:y2, x1:x2]
                        # Match the input geometry detected from the model instead
                        # of hard-coding a single grayscale channel.
                        proc = preprocess_roi(roi, target_size, int(C), grayscale=grayscale)
                        batch = np.expand_dims(proc, 0)
                        last_probs = model.predict(batch, verbose=0)[0]
                        last_pred_idx = int(np.argmax(last_probs))
                        last_infer_frame = frame_i
                        if debug:
                            # Save an occasional ROI for offline inspection.
                            if frame_i % 50 == 0:
                                os.makedirs('debug_rois', exist_ok=True)
                                cv2.imwrite(f'debug_rois/roi_{frame_i}.png', roi)
                            # Periodic sanity check on the normalized input.
                            if frame_i - last_debug_print > 30:
                                print(f'[DBG] proc stats min={proc.min():.3f} max={proc.max():.3f} mean={proc.mean():.3f}')
                                last_debug_print = frame_i
                    if last_probs is not None:
                        idx = last_pred_idx
                        conf = float(last_probs[idx])
                        pred_buf.append(idx)
                        voted = majority_vote(pred_buf)
                        voted_conf = conf if voted == idx else last_probs[voted]
                        last_label = labels[voted] if voted < len(labels) else str(voted)
                        last_conf = float(voted_conf)
                        color = (0, 200, 0) if last_conf >= MIN_CONFIDENCE else (0, 0, 255)
                        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    if debug and last_probs is not None:
                        # Overlay the top-K class probabilities next to the bbox.
                        topk_idx = np.argsort(last_probs)[-TOPK:][::-1]
                        overlay_lines = [f'{labels[i]}:{last_probs[i]:.2f}' for i in topk_idx]
                        for li, line in enumerate(overlay_lines):
                            cv2.putText(frame, line, (x2 + 10, y1 + 20 + li * 18),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (50, 255, 50), 1, cv2.LINE_AA)
                        # Show the preprocessed patch (scaled up) in the top-right corner.
                        preview = proc if proc.shape[-1] == 3 else proc[..., 0]
                        preview_disp = cv2.resize((preview * 255).astype('uint8'), (112, 112),
                                                  interpolation=cv2.INTER_NEAREST)
                        if len(preview_disp.shape) == 2:
                            preview_disp = cv2.cvtColor(preview_disp, cv2.COLOR_GRAY2BGR)
                        ph, pw = preview_disp.shape[:2]
                        fh, fw = frame.shape[:2]
                        frame[0:ph, fw - pw:fw] = preview_disp
                else:
                    last_label = ''
            else:
                last_label = ''
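            # FPS is averaged over 10-frame windows and drawn only on the frame
            # that completes a window.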
            if frame_i % 10 == 0:
                now = time.time()
                fps = 10.0 / (now - fps_t)
                fps_t = now
            else:
                fps = None
            text = f'{last_label} {last_conf*100:.1f}%' if last_label else ('Detecting...' if use_mediapipe else 'Place hand')
            # Draw each string twice: a thick dark stroke, then the foreground color.
            cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, TEXT_BG_COLOR, 4, cv2.LINE_AA)
            cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, TEXT_FG_COLOR, 2, cv2.LINE_AA)
            if fps is not None:
                fps_text = f'FPS: {fps:.1f}'
                cv2.putText(frame, fps_text, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, TEXT_BG_COLOR, 3, cv2.LINE_AA)
                cv2.putText(frame, fps_text, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.6, FPS_FG_COLOR, 1, cv2.LINE_AA)
            cv2.imshow('Sign Inference (CNN)', frame)
            k = cv2.waitKey(1) & 0xFF
            if k in (27, ord('q')):  # Esc or 'q' quits
                break
            elif k == ord('r') and not use_mediapipe:
                fallback_roi = None
                print('[INFO] ROI reset')
            elif k == ord('f'):
                mirror = not mirror
                print(f'[INFO] Mirror set to {mirror}')
            elif k == ord('d') and use_mediapipe:
                # The interval is a module constant, so just report it.
                print(f'[INFO] Detection interval is fixed at {DETECTION_INTERVAL}; change the constant and restart to adjust it.')
            elif k == ord('g'):
                debug = not debug
                print(f'[INFO] Debug mode = {debug}')
    finally:
        cap.release()
        cv2.destroyAllWindows()
        if use_mediapipe:
            hands.close()
        print('[INFO] Exit')


if __name__ == '__main__':
    main()