
Commit 176a2ad

add MoveNet pose detection
Updated the pose detection model from PoseNet to MoveNet.
1 parent 7f4d1bc commit 176a2ad

File tree

5 files changed: +108 -166 lines changed


examples/PoseDetection/index.html

Lines changed: 2 additions & 9 deletions
@@ -9,18 +9,11 @@
   <head>
     <meta charset="UTF-8" />
     <title>PoseNet example using p5.js</title>
-
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/0.6.0/p5.min.js"></script>
-    <script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/0.6.0/addons/p5.dom.min.js"></script>
-    <script
-      src="https://unpkg.com/[email protected]/dist/ml5.min.js"
-      type="text/javascript"
-    ></script>
-    <link rel="stylesheet" type="text/css" href="style.css" />
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.6.0/p5.js"></script>
+    <script src="../../dist/ml5.js"></script>
   </head>

   <body>
-    <p id="status">Loading model...</p>
     <script src="sketch.js"></script>
   </body>
 </html>

examples/PoseDetection/sketch.js

Lines changed: 7 additions & 7 deletions
@@ -5,7 +5,7 @@

 /* ===
 ml5 Example
-PoseNet example using p5.js
+poseDetection example using p5.js
 === */

 let video;
@@ -18,7 +18,7 @@ function setup() {
   video.size(width, height);

   // Create a new poseNet method with a single detection
-  poseNet = ml5.poseNet(video, modelReady);
+  poseNet = ml5.poseDetection(video, modelReady);
   // This sets up an event that fills the global variable "poses"
   // with an array every time new poses are detected
   poseNet.on("pose", function (results) {
@@ -29,31 +29,31 @@ function setup() {
 }

 function modelReady() {
-  select("#status").html("Model Loaded");
+  console.log("Model Loaded!");
 }

 function draw() {
   image(video, 0, 0, width, height);
-
+  //console.log(poses);
   // We can call both functions to draw all keypoints and the skeletons
   drawKeypoints();
-  drawSkeleton();
+  //drawSkeleton();
 }

 // A function to draw ellipses over the detected keypoints
 function drawKeypoints() {
   // Loop through all the poses detected
   for (let i = 0; i < poses.length; i++) {
     // For each pose detected, loop through all the keypoints
-    let pose = poses[i].pose;
+    let pose = poses[i];
     for (let j = 0; j < pose.keypoints.length; j++) {
       // A keypoint is an object describing a body part (like rightArm or leftShoulder)
       let keypoint = pose.keypoints[j];
       // Only draw an ellipse is the pose probability is bigger than 0.2
       if (keypoint.score > 0.2) {
         fill(255, 0, 0);
         noStroke();
-        ellipse(keypoint.position.x, keypoint.position.y, 10, 10);
+        ellipse(keypoint.x, keypoint.y, 10, 10);
       }
     }
   }
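Note on the sketch.js changes above: ml5.poseDetection now emits the array returned by the TF.js detector directly, so each element of poses is itself a pose object (no .pose wrapper and no precomputed skeleton), and each keypoint exposes x, y, and score at the top level instead of a position object. A minimal sketch of reading one pose under that assumption (the name field on keypoints comes from the TF.js pose-detection output format, not from this diff):

poseNet.on("pose", function (poses) {
  if (poses.length === 0) return;
  // Each pose is the raw MoveNet result: { keypoints: [{ x, y, score, name }, ...], score, ... }
  const nose = poses[0].keypoints.find((k) => k.name === "nose");
  if (nose && nose.score > 0.2) {
    console.log("nose at", nose.x, nose.y);
  }
});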

package.json

Lines changed: 2 additions & 0 deletions
@@ -27,7 +27,9 @@
   },
   "dependencies": {
     "@mediapipe/hands": "^0.4.1675469240",
+    "@mediapipe/pose": "^0.5.1675469404",
     "@tensorflow-models/hand-pose-detection": "^2.0.0",
+    "@tensorflow-models/pose-detection": "^2.1.0",
     "@tensorflow/tfjs": "^4.2.0",
     "@tensorflow/tfjs-vis": "^1.5.1",
     "axios": "^1.3.4"

src/PoseDetection/index.js

Lines changed: 79 additions & 150 deletions
@@ -10,183 +10,117 @@ Ported from pose-detection at Tensorflow.js

 import EventEmitter from "events";
 import * as tf from "@tensorflow/tfjs";
-import * as posenet from "@tensorflow-models/posenet";
+import * as bodyPoseDetection from "@tensorflow-models/pose-detection";
 import callCallback from "../utils/callcallback";
 import handleArguments from "../utils/handleArguments";
+import { mediaReady } from "../utils/imageUtilities";

-const DEFAULTS = {
-  architecture: "MobileNetV1", // 'MobileNetV1', 'ResNet50'
-  outputStride: 16, // 8, 16, 32
-  flipHorizontal: false, // true, false
-  minConfidence: 0.5,
-  maxPoseDetections: 5, // any number > 1
-  scoreThreshold: 0.5,
-  nmsRadius: 20, // any number > 0
-  detectionType: "multiple", // 'single'
-  inputResolution: 256, // or { width: 257, height: 200 }
-  multiplier: 0.75, // 1.01, 1.0, 0.75, or 0.50 -- only for MobileNet
-  quantBytes: 2, // 4, 2, 1
-  modelUrl: null, // url path to model
-};
-
-class PoseNet extends EventEmitter {
+class PoseDetection extends EventEmitter {
   /**
    * @typedef {Object} options
-   * @property {string} architecture - default 'MobileNetV1',
-   * @property {number} inputResolution - default 257,
-   * @property {number} outputStride - default 16
-   * @property {boolean} flipHorizontal - default false
-   * @property {number} minConfidence - default 0.5
-   * @property {number} maxPoseDetections - default 5
-   * @property {number} scoreThreshold - default 0.5
-   * @property {number} nmsRadius - default 20
-   * @property {String} detectionType - default single
-   * @property {number} nmsRadius - default 0.75,
-   * @property {number} quantBytes - default 2,
-   * @property {string} modelUrl - default null
+   * @property {string} modelType - Optional. specify what model variant to load from. Default: 'SINGLEPOSE_LIGHTNING'.
+   * @property {boolean} enableSmoothing - Optional. Whether to use temporal filter to smooth keypoints across frames. Default: true.
+   * @property {string} modelUrl - Optional. A string that specifies custom url of the model. Default to load from tf.hub.
+   * @property {number} minPoseScore - Optional. The minimum confidence score for a pose to be detected. Default: 0.25.
+   * @property {number} multiPoseMaxDimension - Optional. The target maximum dimension to use as the input to the multi-pose model. Must be a mutiple of 32. Default: 256.
+   * @property {boolean} enableTracking - Optional. Track each person across the frame with a unique ID. Default: true.
+   * @property {string} trackerType - Optional. Specify what type of tracker to use. Default: 'boundingBox'.
+   * @property {Object} trackerConfig - Optional. Specify tracker configurations. Use tf.js setting by default.
    */
+
   /**
    * Create a PoseNet model.
    * @param {HTMLVideoElement || p5.Video} video - Optional. A HTML video element or a p5 video element.
   * @param {options} options - Optional. An object describing a model accuracy and performance.
-   * @param {String} detectionType - Optional. A String value to run 'single' or 'multiple' estimation.
   * @param {function} callback Optional. A function to run once the model has been loaded.
   * If no callback is provided, it will return a promise that will be resolved once the
   * model has loaded.
   */
-  constructor(video, options, detectionType, callback) {
+  constructor(video, options, callback) {
     super();
+
     this.video = video;
-    /**
-     * The type of detection. 'single' or 'multiple'
-     * @type {String}
-     * @public
-     */
-    this.modelUrl = options.modelUrl || null;
-    this.architecture = options.architecture || DEFAULTS.architecture;
-    this.detectionType =
-      detectionType || options.detectionType || DEFAULTS.detectionType;
-    this.outputStride = options.outputStride || DEFAULTS.outputStride;
-    this.flipHorizontal = options.flipHorizontal || DEFAULTS.flipHorizontal;
-    this.scoreThreshold = options.scoreThreshold || DEFAULTS.scoreThreshold;
-    this.minConfidence = options.minConfidence || DEFAULTS.minConfidence;
-    this.maxPoseDetections =
-      options.maxPoseDetections || DEFAULTS.maxPoseDetections;
-    this.multiplier = options.multiplier || DEFAULTS.multiplier;
-    this.inputResolution = options.inputResolution || DEFAULTS.inputResolution;
-    this.quantBytes = options.quantBytes || DEFAULTS.quantBytes;
-    this.nmsRadius = options.nmsRadius || DEFAULTS.nmsRadius;
-    this.ready = callCallback(this.load(), callback);
-    // this.then = this.ready.then;
+    this.model = null;
+    this.modelReady = false;
+    this.config = options;
+
+    this.ready = callCallback(this.loadModel(), callback);
   }

-  async load() {
-    let modelJson;
-    if (this.architecture.toLowerCase() === "mobilenetv1") {
-      modelJson = {
-        architecture: this.architecture,
-        outputStride: this.outputStride,
-        inputResolution: this.inputResolution,
-        multiplier: this.multiplier,
-        quantBytes: this.quantBytes,
-        modelUrl: this.modelUrl,
-      };
-    } else {
-      modelJson = {
-        architecture: this.architecture,
-        outputStride: this.outputStride,
-        inputResolution: this.inputResolution,
-        quantBytes: this.quantBytes,
-      };
+  /**
+   * Load the model and set it to this.model
+   * @return {this} the detector model.
+   */
+  async loadModel() {
+    const pipeline = bodyPoseDetection.SupportedModels.MoveNet;
+    //Set the config to user defined or default values
+    const modelConfig = {
+      enableSmoothing: this.config.enableSmoothing ?? true,
+      modelUrl: this.config.modelUrl,
+      minPoseScore: this.config.minPoseScore ?? 0.25,
+      multiPoseMaxDimension: this.config.multiPoseMaxDimension ?? 256,
+      enableTracking: this.config.enableTracking ?? true,
+      trackerType: this.config.trackerType ?? "boundingBox",
+      trackerConfig: this.config.trackerConfig,
+    };
+    switch (this.config.modelType) {
+      case "SINGLEPOSE_LIGHTNING":
+        modelConfig.modelType =
+          bodyPoseDetection.movenet.modelType.SINGLEPOSE_LIGHTNING;
+        break;
+      case "SINGLEPOSE_THUNDER":
+        modelConfig.modelType =
+          bodyPoseDetection.movenet.modelType.SINGLEPOSE_THUNDER;
+      case "MULTIPOSE_LIGHTNING":
+        modelConfig.modelType =
+          bodyPoseDetection.movenet.modelType.MULTIPOSE_LIGHTNING;
     }
-
-    this.net = await posenet.load(modelJson);
+    // Load the detector model
+    await tf.setBackend("webgl");
+    this.model = await bodyPoseDetection.createDetector(pipeline, modelConfig);
+    this.modelReady = true;

     if (this.video) {
-      if (this.video.readyState === 0) {
-        await new Promise((resolve) => {
-          this.video.onloadeddata = () => resolve();
-        });
-      }
-      if (this.detectionType === "single") {
-        this.singlePose();
-      } else {
-        this.multiPose();
-      }
+      this.predict();
     }
-    return this;
-  }

-  skeleton(keypoints, confidence = this.minConfidence) {
-    return posenet.getAdjacentKeyPoints(keypoints, confidence);
+    return this;
   }

-  // eslint-disable-next-line class-methods-use-this
-  mapParts(pose) {
-    const newPose = JSON.parse(JSON.stringify(pose));
-    newPose.keypoints.forEach((keypoint) => {
-      newPose[keypoint.part] = {
-        x: keypoint.position.x,
-        y: keypoint.position.y,
-        confidence: keypoint.score,
-      };
-    });
-    return newPose;
-  }
+  //Add named keypoints to a MoveNet pose object
+  // mapParts(pose) {
+  //   const newPose = JSON.parse(JSON.stringify(pose));
+  //   newPose.keypoints.forEach((keypoint) => {
+  //     newPose[keypoint.part] = {
+  //       x: keypoint.position.x,
+  //       y: keypoint.position.y,
+  //       confidence: keypoint.score,
+  //     };
+  //   });
+  //   return newPose;
+  // }

   /**
    * Given an image or video, returns an array of objects containing pose estimations
-   * using single or multi-pose detection.
-   * @param {HTMLVideoElement || p5.Video || function} inputOr
-   * @param {function} cb
+   * @param {HTMLVideoElement || p5.Video || function} inputOr - An HMTL or p5.js image, video, or canvas element to run the prediction on.
+   * @param {function} cb - A callback function to handle the predictions.
    */
-  async singlePose(inputOr, cb) {
-    const { image: input, callback } = handleArguments(this.video, inputOr, cb);
-
-    const pose = await this.net.estimateSinglePose(input, {
-      flipHorizontal: this.flipHorizontal,
-    });
-    const poseWithParts = this.mapParts(pose);
-    const result = [
-      { pose: poseWithParts, skeleton: this.skeleton(pose.keypoints) },
-    ];
-    this.emit("pose", result);
-
-    if (this.video) {
-      return tf.nextFrame().then(() => this.singlePose());
-    }
-
-    if (typeof callback === "function") {
-      callback(result);
+  async predict(inputOr, cb) {
+    const { image, callback } = handleArguments(this.video, inputOr, cb);
+    if (!image) {
+      throw new Error("No input image found.");
     }
+    // If video is provided, wait for video to be loaded
+    await mediaReady(image, false);
+    const result = await this.model.estimatePoses(image);

-    return result;
-  }
+    //Add named keypoints to each pose object
+    //const result = poses.map((pose) => this.mapParts(pose));

-  /**
-   * Given an image or video, returns an array of objects containing pose
-   * estimations using single or multi-pose detection.
-   * @param {HTMLVideoElement || p5.Video || function} inputOr
-   * @param {function} cb
-   */
-  async multiPose(inputOr, cb) {
-    const { image: input, callback } = handleArguments(this.video, inputOr, cb);
-
-    const poses = await this.net.estimateMultiplePoses(input, {
-      flipHorizontal: this.flipHorizontal,
-      maxDetections: this.maxPoseDetections,
-      scoreThreshold: this.scoreThreshold,
-      nmsRadius: this.nmsRadius,
-    });
-
-    const posesWithParts = poses.map((pose) => this.mapParts(pose));
-    const result = posesWithParts.map((pose) => ({
-      pose,
-      skeleton: this.skeleton(pose.keypoints),
-    }));
     this.emit("pose", result);
+
     if (this.video) {
-      return tf.nextFrame().then(() => this.multiPose());
+      return tf.nextFrame().then(() => this.predict());
     }

     if (typeof callback === "function") {
@@ -198,13 +132,8 @@ class PoseNet extends EventEmitter {
 }

 const poseDetection = (...inputs) => {
-  const {
-    video,
-    options = {},
-    callback,
-    string: detectionType,
-  } = handleArguments(...inputs);
-  return new PoseNet(video, options, detectionType, callback);
+  const { video, options = {}, callback } = handleArguments(...inputs);
+  return new PoseDetection(video, options, callback);
 };

 export default poseDetection;
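For reference, a minimal usage sketch of the new entry point, pieced together from the constructor signature, the options @typedef, and the "pose" event in the file above; the specific option values are illustrative and not part of this commit:

let poseDetector;

function setup() {
  createCanvas(640, 480);
  const video = createCapture(VIDEO);
  const options = {
    modelType: "MULTIPOSE_LIGHTNING", // or "SINGLEPOSE_LIGHTNING" / "SINGLEPOSE_THUNDER"
    enableSmoothing: true,
    minPoseScore: 0.25,
    enableTracking: true,
  };
  // With a video argument, predict() keeps running via tf.nextFrame()
  // and results are delivered through the "pose" event.
  poseDetector = ml5.poseDetection(video, options, () => {
    console.log("MoveNet detector ready");
  });
  poseDetector.on("pose", (poses) => {
    // poses is the array returned by the detector's estimatePoses() call
  });
}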
