Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions examples/objectDetection/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>ml5.js objectDetector Webcam Example</title>
    <!-- p5.js provides the canvas/webcam helpers; ml5.js is the local build -->
    <script src="https://cdn.jsdelivr.net/npm/[email protected]/lib/p5.js"></script>
    <script src="../../dist/ml5.js"></script>
  </head>
  <body>
    <main></main>
    <script src="sketch.js"></script>
  </body>
</html>
52 changes: 52 additions & 0 deletions examples/objectDetection/sketch.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
 * 👋 Hello! This is an ml5.js example made and shared with ❤️.
 * Learn more about the ml5.js project: https://ml5js.org/
 * ml5.js license and Code of Conduct: https://github.com/ml5js/ml5-next-gen/blob/main/LICENSE.md
 *
 * This example demonstrates object detection on live webcam video
 * through ml5.objectDetector (COCO-SSD), using a callback pattern.
 */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a tiny thing, but sometime this year we adopted a "friendlier" pattern for comments at the top of an example sketch. I don't think we need the copyright and technically the license should be the ml5.js one. Here is what it looks like from a hand pose example:

/*
 * 👋 Hello! This is an ml5.js example made and shared with ❤️.
 * Learn more about the ml5.js project: https://ml5js.org/
 * ml5.js license and Code of Conduct: https://github.com/ml5js/ml5-next-gen/blob/main/LICENSE.md
 *
 * This example demonstrates face tracking on live video through ml5.faceMesh.
 */

(This also reminds me that we discussed moving the Code of Conduct to the website, I forgot where we are with that though maybe @MOQN remembers?)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll include it for sure!


// Webcam feed (p5 capture element), drawn to the canvas each frame.
let video;
// ml5 object detector instance, created in preload().
let detector;
// Most recent detection results, updated by gotDetections().
let detections = [];

// Load the COCO-SSD object detector before setup() runs,
// so the model is ready when detection starts.
function preload() {
  const modelName = "cocossd";
  detector = ml5.objectDetector(modelName);
}

// Create the canvas and webcam feed, then begin continuous detection.
function setup() {
  createCanvas(640, 480);

  // The webcam element is hidden; draw() paints its frames onto the canvas.
  video = createCapture(VIDEO);
  video.size(width, height);
  video.hide();

  // gotDetections() runs every time the model produces new results.
  detector.detectStart(video, gotDetections);
}

// Callback for detectStart(): cache the newest results so draw() can render them.
function gotDetections(newDetections) {
  detections = newDetections;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't over do it, but a few concise explanatory comments might be good to add.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shiffman thank you for the comments! I will update it for review.


// Draw the current video frame, then overlay a bounding box and label
// for every detection from the most recent gotDetections() callback.
function draw() {
  image(video, 0, 0);

  for (let i = 0; i < detections.length; i += 1) {
    // p5.js style convention: use `let` even where `const` would be typical JS.
    let detection = detections[i];

    // draw bounding box
    stroke(0, 255, 0);
    strokeWeight(4);
    noFill();
    rect(detection.x, detection.y, detection.width, detection.height);

    // draw label just inside the top-left corner of the box
    noStroke();
    fill(255);
    textSize(24);
    text(detection.label, detection.x + 10, detection.y + 24);
  }
}
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
"@mediapipe/pose": "^0.5.1675469404",
"@mediapipe/selfie_segmentation": "~0.1.0",
"@tensorflow-models/body-segmentation": "^1.0.1",
"@tensorflow-models/coco-ssd": "^2.2.3",
"@tensorflow-models/face-landmarks-detection": "1.0.5",
"@tensorflow-models/hand-pose-detection": "^2.0.0",
"@tensorflow-models/mobilenet": "^2.1.0",
Expand Down
71 changes: 71 additions & 0 deletions src/ObjectDetector/cocossd.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright (c) 2019 ml5
//
// This software is released under the MIT License.
// https://opensource.org/licenses/MIT

/*
COCO-SSD Object detection model
Wraps the coco-ssd model in tfjs to be used in ml5
*/
import * as tf from "@tensorflow/tfjs";
import * as cocoSsd from "@tensorflow-models/coco-ssd";
import { mediaReady } from "../utils/imageUtilities";

// Default model configuration. `base` selects the backbone CNN used by
// the coco-ssd package; `modelUrl` optionally points at a custom model.
const DEFAULTS = {
  base: "lite_mobilenet_v2",
  modelUrl: undefined,
};

export class CocoSsd {
  /**
   * Create a COCO-SSD wrapper. Does not load the model — call load().
   * @param {Object} options - Optional model configuration.
   * @param {string} [options.base] - Backbone to use; defaults to "lite_mobilenet_v2".
   * @param {string} [options.modelUrl] - URL of a custom model to load instead.
   */
  constructor(options = {}) {
    this.model = null;
    this.config = {
      base: options.base || DEFAULTS.base,
      modelUrl: options.modelUrl || DEFAULTS.modelUrl,
    };
  }

  /**
   * Load the underlying tfjs coco-ssd model.
   * @returns {Promise<CocoSsd>} this, once the model is ready.
   */
  async load() {
    await tf.setBackend("webgl"); // this line resolves warning : performance is poor on webgpu backend
    await tf.ready();

    this.model = await cocoSsd.load(this.config);
    return this;
  }

  /**
   * Detect objects that are in the image/video/canvas
   * @param {HTMLVideoElement|HTMLImageElement|HTMLCanvasElement|ImageData} imgToPredict - Subject of the detection.
   * @returns {Promise<Array>} Array of detection objects.
   * @throws {Error} if called before load() has completed.
   */
  async detect(imgToPredict) {
    if (!this.model) {
      throw new Error("CocoSsd: model is not loaded yet — call load() before detect().");
    }
    // Bug fix: mediaReady returns a promise and must be awaited (as it is in
    // ObjectDetector.detectStart), otherwise detection may run on a media
    // element that has no valid dimensions yet.
    await mediaReady(imgToPredict, true);
    await tf.nextFrame();

    const detections = await this.model.detect(imgToPredict);
    return detections.map((prediction) => ({
      label: prediction.class,
      confidence: prediction.score,
      x: prediction.bbox[0],
      y: prediction.bbox[1],
      width: prediction.bbox[2],
      height: prediction.bbox[3],
      // Box coordinates scaled to the 0..1 range of the source dimensions.
      normalized: {
        x: prediction.bbox[0] / imgToPredict.width,
        y: prediction.bbox[1] / imgToPredict.height,
        width: prediction.bbox[2] / imgToPredict.width,
        height: prediction.bbox[3] / imgToPredict.height,
      },
    }));
  }
}

/**
 * Convenience factory: construct a CocoSsd wrapper and load its model.
 * @param {Object} modelConfig - Optional configuration forwarded to the constructor.
 * @returns {Promise<CocoSsd>} a ready-to-use CocoSsd instance.
 */
export async function load(modelConfig = {}) {
  const instance = new CocoSsd(modelConfig);
  await instance.load();
  return instance;
}
156 changes: 156 additions & 0 deletions src/ObjectDetector/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Copyright (c) 2019 ml5
//
// This software is released under the MIT License.
// https://opensource.org/licenses/MIT

/*
ObjectDetection
*/

import * as cocoSsd from "./cocossd.js";
import { handleModelName } from "../utils/handleOptions";
import handleArguments from "../utils/handleArguments";
import callCallback from "../utils/callcallback";
import { mediaReady } from "../utils/imageUtilities";

const MODEL_OPTIONS = ["cocossd"]; // Expandable for other models like YOLO

class ObjectDetector {
  /**
   * @typedef {Object} options
   * @property {number} filterBoxesThreshold - Optional. default 0.01
   * @property {number} IOUThreshold - Optional. default 0.4
   * @property {number} classProbThreshold - Optional. default 0.4
   */
  /**
   * Create ObjectDetector model. Works on video and images.
   * @param {string} modelNameOrUrl - The name or the URL of the model to use.
   *   Currently only "cocossd" is supported.
   * @param {Object} options - Optional. A set of options.
   * @param {function} callback - Optional. A callback function that is called once the model has loaded.
   */
  constructor(modelNameOrUrl, options = {}, callback) {
    this.model = null;
    this.modelToUse = null;

    // flags for detectStart() and detectStop()
    this.isDetecting = false;
    this.signalStop = false;
    this.prevCall = "";

    // Normalize the requested model name (falls back to "cocossd").
    this.modelName = handleModelName(
      modelNameOrUrl,
      MODEL_OPTIONS,
      "cocossd",
      "objectDetector"
    );

    // Map the model name to its loader module. Only COCO-SSD is wired up;
    // other models (e.g. YOLO) can be added here once implemented.
    // Bug fix: the previous `case "yolo": this.modelToUse = yolo;` referenced
    // an undefined variable `yolo` and would throw a ReferenceError. Unknown
    // names now fall through to COCO-SSD with a warning.
    switch (this.modelName) {
      case "cocossd":
        this.modelToUse = cocoSsd;
        break;
      default:
        console.warn(`Unknown model: ${this.modelName}, defaulting to CocoSsd`);
        this.modelToUse = cocoSsd;
    }

    // load model and assign ready promise
    this.ready = callCallback(this.loadModel(options), callback);
  }

  /**
   * Load the selected model module.
   * @param {Object} options - Model configuration forwarded to the loader.
   * @returns {Promise<ObjectDetector>} this, once the underlying model has loaded.
   * @throws {Error} if the selected module does not expose a load() function.
   */
  async loadModel(options) {
    if (!this.modelToUse || !this.modelToUse.load) {
      throw new Error(`Model loader is missing or invalid for: ${this.modelName}`);
    }

    this.model = await this.modelToUse.load(options);

    return this;
  }

  /**
   * @typedef {Object} ObjectDetectorPrediction
   * @property {number} x - top left x coordinate of the prediction box in pixels.
   * @property {number} y - top left y coordinate of the prediction box in pixels.
   * @property {number} width - width of the prediction box in pixels.
   * @property {number} height - height of the prediction box in pixels.
   * @property {string} label - the label given.
   * @property {number} confidence - the confidence score (0 to 1).
   * @property {ObjectDetectorPredictionNormalized} normalized - a normalized object of the prediction
   */

  /**
   * @typedef {Object} ObjectDetectorPredictionNormalized
   * @property {number} x - top left x coordinate of the prediction box (0 to 1).
   * @property {number} y - top left y coordinate of the prediction box (0 to 1).
   * @property {number} width - width of the prediction box (0 to 1).
   * @property {number} height - height of the prediction box (0 to 1).
   */

  /**
   * Detect objects once from the input image/video/canvas.
   * @param {HTMLVideoElement|HTMLImageElement|HTMLCanvasElement|ImageData} input - Target element.
   * @param {function} cb - Optional callback.
   * @returns {Promise<ObjectDetectorPrediction[]>} the detections.
   */
  async detect(input, cb) {
    const args = handleArguments(input, cb).require("image", "No valid image input.");
    // Wait for the model to finish loading before running detection.
    await this.ready;
    return callCallback(this.model.detect(args.image), args.callback);
  }

  /**
   * Start continuous detection on video/canvas input
   * @param {HTMLVideoElement|HTMLImageElement|HTMLCanvasElement|ImageData} input - Target element.
   * @param {function} callback - Callback function called with each detection result.
   */
  async detectStart(input, callback) {
    const args = handleArguments(input, callback).require("image", "No input provided.");

    // One detection pass per animation frame until detectStop() is called.
    const detectFrame = async () => {
      await mediaReady(args.image, true);
      await callCallback(this.model.detect(args.image), args.callback);

      if (!this.signalStop) {
        requestAnimationFrame(detectFrame);
      } else {
        this.isDetecting = false;
      }
    };

    this.signalStop = false;
    if (!this.isDetecting) {
      this.isDetecting = true;
      // Deliberately not awaited: the loop runs in the background and reports
      // results through the callback.
      detectFrame();
    }

    if (this.prevCall === "start") {
      console.warn(
        "detectStart() called again without detectStop(). Only the latest call is running."
      );
    }

    this.prevCall = "start";
  }

  /**
   * Request the continuous-detection loop to stop after its current frame.
   */
  detectStop() {
    if (this.isDetecting) {
      this.signalStop = true;
    }
    this.prevCall = "stop";
  }
}

/**
 * Factory exposed as ml5.objectDetector(). Accepts a flexible argument list
 * (model name/URL, options object, callback in any sensible order) and
 * returns an ObjectDetector instance.
 */
const objectDetector = (modelNameOrUrl, optionsOrCallback, cb) => {
  const { string: modelName, options = {}, callback } = handleArguments(
    modelNameOrUrl,
    optionsOrCallback,
    cb
  );
  return new ObjectDetector(modelName, options, callback);
};
4 changes: 3 additions & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import faceMesh from "./FaceMesh";
import bodyPose from "./BodyPose";
import imageClassifier from "./ImageClassifier";
import soundClassifier from "./SoundClassifier";
import objectDetector from "./ObjectDetector";
import setBackend from "./utils/setBackend";
import bodySegmentation from "./BodySegmentation";
import communityStatement from "./utils/communityStatement";
Expand All @@ -22,6 +23,7 @@ const withPreload = {
neuralNetwork,
sentiment,
soundClassifier,
objectDetector
};

const ml5 = Object.assign({ p5Utils }, withPreload, {
Expand All @@ -36,4 +38,4 @@ p5Utils.shouldPreload(ml5, Object.keys(withPreload));

communityStatement();

export default ml5;
export default ml5;
11 changes: 11 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1731,6 +1731,16 @@ __metadata:
languageName: node
linkType: hard

"@tensorflow-models/coco-ssd@npm:^2.2.3":
version: 2.2.3
resolution: "@tensorflow-models/coco-ssd@npm:2.2.3"
peerDependencies:
"@tensorflow/tfjs-converter": ^4.10.0
"@tensorflow/tfjs-core": ^4.10.0
checksum: 10c0/3d0e54d433e388439a461f9e7b4995bcfab7825d49eaf587818800549c54adc8192dab1b5a82e0ef48d87065ce155261a3ce934000477ac0da65de6939568e2e
languageName: node
linkType: hard

"@tensorflow-models/face-landmarks-detection@npm:1.0.5":
version: 1.0.5
resolution: "@tensorflow-models/face-landmarks-detection@npm:1.0.5"
Expand Down Expand Up @@ -6537,6 +6547,7 @@ __metadata:
"@mediapipe/pose": "npm:^0.5.1675469404"
"@mediapipe/selfie_segmentation": "npm:~0.1.0"
"@tensorflow-models/body-segmentation": "npm:^1.0.1"
"@tensorflow-models/coco-ssd": "npm:^2.2.3"
"@tensorflow-models/face-landmarks-detection": "npm:1.0.5"
"@tensorflow-models/hand-pose-detection": "npm:^2.0.0"
"@tensorflow-models/mobilenet": "npm:^2.1.0"
Expand Down