-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTTS.py
More file actions
174 lines (133 loc) · 5.98 KB
/
TTS.py
File metadata and controls
174 lines (133 loc) · 5.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import cv2
import time
import urllib.request
import numpy as np
# --- Configuration ---
# IMPORTANT: Replace with the IP address shown in the ESP32 Serial Monitor
ESP32_CAM_IP = "esp32camurl" # <--- CHANGE ME
# MJPEG stream endpoint served by the ESP32-CAM firmware.
STREAM_URL = f'http://{ESP32_CAM_IP}/stream'
# --- YOLO Model Files (YOLOv3-Tiny is recommended for Raspberry Pi performance) ---
# You need to download these files and place them in the same directory as this script.
# Search online for "YOLOv3-tiny" configuration files.
CONFIG_FILE = "yolov3-tiny.cfg"
WEIGHTS_FILE = "yolov3-tiny.weights"
NAMES_FILE = "coco.names"
# --- Constants ---
CONFIDENCE_THRESHOLD = 0.5  # minimum class score for a detection to be kept
NMS_THRESHOLD = 0.3 # Non-Maximum Suppression threshold
INPUT_SIZE = 320 # Smaller size for faster inference on Pi (320x320 is good for tiny models)
def initialize_yolo_model(config_path, weights_path, names_path):
    """Load the YOLO network and class labels via OpenCV's DNN module.

    Args:
        config_path: Path to the Darknet .cfg file.
        weights_path: Path to the matching .weights file.
        names_path: Path to the newline-separated class-name file.

    Returns:
        Tuple ``(net, classes, output_layers)`` on success, or
        ``(None, None, None)`` if any file is missing or unreadable.
    """
    try:
        # Load class names (one label per line).
        with open(names_path, 'r') as f:
            classes = [line.strip() for line in f.readlines()]
        # Load the network using OpenCV's DNN module
        net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
        # Determine the output layer names. getUnconnectedOutLayers()
        # returns a flat array in OpenCV >= 4.5.4 but an Nx1 array in
        # older releases; flatten() makes the indexing work in both.
        layer_names = net.getLayerNames()
        output_layers = [layer_names[i - 1]
                         for i in net.getUnconnectedOutLayers().flatten()]
        # Set target backend and target device for Raspberry Pi CPU optimization
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
        print("YOLO model loaded successfully. Ready for detection.")
        return net, classes, output_layers
    except Exception as e:
        print(f"Error loading YOLO files. Check paths and file integrity: {e}")
        print("Please ensure you have the following files in the script directory:")
        # Report the paths that were actually passed in, not the module-level
        # defaults — the original printed the globals, which is misleading
        # when a caller supplies different paths.
        print(f"  - {config_path}")
        print(f"  - {weights_path}")
        print(f"  - {names_path}")
        return None, None, None
def process_frame(net, classes, output_layers, frame):
    """Run YOLO detection on a single frame and draw the results onto it.

    Args:
        net: The loaded cv2.dnn network.
        classes: List of class-name strings (COCO labels).
        output_layers: Names of the network's unconnected output layers.
        frame: BGR image (numpy array) to detect on; annotated in place.

    Returns:
        The same frame with bounding boxes and labels drawn on it.
    """
    frame_h, frame_w, _ = frame.shape

    # Convert the frame to a normalized square blob and run a forward pass.
    blob = cv2.dnn.blobFromImage(frame, 1/255.0, (INPUT_SIZE, INPUT_SIZE), swapRB=True, crop=False)
    net.setInput(blob)
    layer_outputs = net.forward(output_layers)

    # Gather every candidate detection above the confidence cut-off.
    boxes, confidences, class_ids = [], [], []
    for output in layer_outputs:
        for detection in output:
            # Row layout: [cx, cy, w, h, objectness, per-class scores...]
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > CONFIDENCE_THRESHOLD:
                # Coordinates are normalized; scale back to pixel space.
                cx = int(detection[0] * frame_w)
                cy = int(detection[1] * frame_h)
                box_w = int(detection[2] * frame_w)
                box_h = int(detection[3] * frame_h)
                # Convert from center point to top-left corner.
                left = int(cx - box_w / 2)
                top = int(cy - box_h / 2)
                boxes.append([left, top, box_w, box_h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    # Non-Maximum Suppression prunes overlapping duplicate boxes.
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)

    # Annotate only the boxes that survived suppression.
    for i, (left, top, box_w, box_h) in enumerate(boxes):
        if i not in indexes:
            continue
        label = str(classes[class_ids[i]])
        score = confidences[i]
        box_color = (0, 255, 0)  # Green box
        cv2.rectangle(frame, (left, top), (left + box_w, top + box_h), box_color, 2)
        caption = f"{label}: {score:.2f}"
        cv2.putText(frame, caption, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

    return frame
def run_detector():
    """Main function to capture the stream and run detection.

    Connects to the ESP32-CAM MJPEG stream, runs YOLO on each frame,
    displays the annotated video, and prints an FPS estimate every 10
    frames. Exits on the 'q' key or Ctrl-C; always releases resources.
    """
    # 1. Initialize YOLO; bail out early if the model files are missing.
    net, classes, output_layers = initialize_yolo_model(CONFIG_FILE, WEIGHTS_FILE, NAMES_FILE)
    if net is None:
        return

    # 2. Initialize Video Stream Capture
    cap = cv2.VideoCapture(STREAM_URL)
    if not cap.isOpened():
        print(f"Error: Could not connect to ESP32 stream at {STREAM_URL}")
        return
    print("Successfully connected to ESP32 stream. Starting loop...")

    frame_count = 0
    start_time = time.time()
    try:
        while True:
            # Read a frame from the video stream
            ret, frame = cap.read()
            if not ret:
                print("Stream ended or frame could not be read. Attempting reconnect...")
                # BUGFIX: release the stale capture before opening a new one,
                # otherwise the old handle/socket leaks on every retry.
                cap.release()
                cap = cv2.VideoCapture(STREAM_URL)  # Attempt to reconnect
                time.sleep(1)
                continue

            # Run detection on the captured frame and display the result.
            processed_frame = process_frame(net, classes, output_layers, frame)
            cv2.imshow("ESP32-CAM YOLO Detection", processed_frame)

            # Print an FPS estimate averaged over the last 10 frames.
            frame_count += 1
            if frame_count % 10 == 0:
                end_time = time.time()
                fps = 10 / (end_time - start_time)
                print(f"FPS: {fps:.2f}")
                start_time = time.time()

            # Exit on 'q' press
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    except KeyboardInterrupt:
        print("\nStopping detection loop...")
    finally:
        # Always free the capture and any OpenCV windows, even on error.
        cap.release()
        cv2.destroyAllWindows()
# Entry point: run the detector only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    run_detector()