Skip to content

Commit

Permalink
feat: implement FaceDetector (#974)
Browse files Browse the repository at this point in the history
* feat: implement FaceDetectorOptions

* feat: implement FaceDetector

* build: include FaceDetector

* build: include blaze_face_short_range.tflite
  • Loading branch information
homuler committed Jul 29, 2023
1 parent 3eb66c8 commit c777113
Show file tree
Hide file tree
Showing 14 changed files with 395 additions and 0 deletions.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Copyright (c) 2023 homuler
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

using System.Collections.Generic;

using FaceDetectionResult = Mediapipe.Tasks.Components.Containers.DetectionResult;

namespace Mediapipe.Tasks.Vision.FaceDetector
{
public sealed class FaceDetector : Core.BaseVisionTaskApi
{
  private const string _DETECTIONS_OUT_STREAM_NAME = "detections";
  private const string _DETECTIONS_TAG = "DETECTIONS";
  private const string _NORM_RECT_STREAM_NAME = "norm_rect_in";
  private const string _NORM_RECT_TAG = "NORM_RECT";
  private const string _IMAGE_IN_STREAM_NAME = "image_in";
  private const string _IMAGE_OUT_STREAM_NAME = "image_out";
  private const string _IMAGE_TAG = "IMAGE";
  private const string _TASK_GRAPH_NAME = "mediapipe.tasks.vision.face_detector.FaceDetectorGraph";

  // Declared as long so that `timestampMs * _MICRO_SECONDS_PER_MILLISECOND`
  // is computed in 64-bit arithmetic. With an int constant the product
  // overflows once timestampMs exceeds int.MaxValue / 1000 (~35.7 minutes).
  private const long _MICRO_SECONDS_PER_MILLISECOND = 1000;

#pragma warning disable IDE0052 // Remove unread private members
  /// <remarks>
  /// keep reference to prevent GC from collecting the callback instance.
  /// </remarks>
  private readonly Tasks.Core.TaskRunner.PacketsCallback _packetCallback;
#pragma warning restore IDE0052

  private FaceDetector(
    CalculatorGraphConfig graphConfig,
    Core.RunningMode runningMode,
    Tasks.Core.TaskRunner.PacketsCallback packetCallback) : base(graphConfig, runningMode, packetCallback)
  {
    _packetCallback = packetCallback;
  }

  /// <summary>
  /// Creates a <see cref="FaceDetector" /> object from a TensorFlow Lite model and the default <see cref="FaceDetectorOptions" />.
  ///
  /// Note that the created <see cref="FaceDetector" /> instance is in image mode,
  /// for detecting faces on single image inputs.
  /// </summary>
  /// <param name="modelPath">Path to the model.</param>
  /// <returns>A <see cref="FaceDetector" /> object in image running mode.</returns>
  public static FaceDetector CreateFromModelPath(string modelPath)
  {
    var baseOptions = new Tasks.Core.BaseOptions(modelAssetPath: modelPath);
    var options = new FaceDetectorOptions(baseOptions, runningMode: Core.RunningMode.IMAGE);
    return CreateFromOptions(options);
  }

  /// <summary>
  /// Creates the <see cref="FaceDetector" /> object from <see cref="FaceDetectorOptions" />.
  /// </summary>
  /// <param name="options">Options for the face detector task.</param>
  /// <returns>A <see cref="FaceDetector" /> object configured with <paramref name="options" />.</returns>
  public static FaceDetector CreateFromOptions(FaceDetectorOptions options)
  {
    var taskInfo = new Tasks.Core.TaskInfo<FaceDetectorOptions>(
      taskGraph: _TASK_GRAPH_NAME,
      inputStreams: new List<string> {
        string.Join(":", _IMAGE_TAG, _IMAGE_IN_STREAM_NAME),
        string.Join(":", _NORM_RECT_TAG, _NORM_RECT_STREAM_NAME),
      },
      outputStreams: new List<string> {
        string.Join(":", _DETECTIONS_TAG, _DETECTIONS_OUT_STREAM_NAME),
        string.Join(":", _IMAGE_TAG, _IMAGE_OUT_STREAM_NAME),
      },
      taskOptions: options);

    return new FaceDetector(
      taskInfo.GenerateGraphConfig(options.runningMode == Core.RunningMode.LIVE_STREAM),
      options.runningMode,
      BuildPacketsCallback(options.resultCallback));
  }

  /// <summary>
  /// Performs face detection on the provided MediaPipe Image.
  ///
  /// Only use this method when the <see cref="FaceDetector" /> is created with the image running mode.
  /// </summary>
  /// <returns>
  /// A face detection result object that contains a list of face detections,
  /// each detection has a bounding box that is expressed in the unrotated input
  /// frame of reference coordinates system, i.e. in `[0,image_width) x [0,
  /// image_height)`, which are the dimensions of the underlying image data.
  /// </returns>
  public FaceDetectionResult Detect(Image image, Core.ImageProcessingOptions? imageProcessingOptions = null)
  {
    var normalizedRect = ConvertToNormalizedRect(imageProcessingOptions, image, roiAllowed: false);

    var packetMap = new PacketMap();
    packetMap.Emplace(_IMAGE_IN_STREAM_NAME, new ImagePacket(image));
    packetMap.Emplace(_NORM_RECT_STREAM_NAME, new NormalizedRectPacket(normalizedRect));
    var outputPackets = ProcessImageData(packetMap);

    return GetFaceDetectionResult(outputPackets);
  }

  /// <summary>
  /// Performs face detection on the provided video frames.
  ///
  /// Only use this method when the FaceDetector is created with the video
  /// running mode. It's required to provide the video frame's timestamp (in
  /// milliseconds) along with the video frame. The input timestamps should be
  /// monotonically increasing for adjacent calls of this method.
  /// </summary>
  /// <returns>
  /// A face detection result object that contains a list of face detections,
  /// each detection has a bounding box that is expressed in the unrotated input
  /// frame of reference coordinates system, i.e. in `[0,image_width) x [0,
  /// image_height)`, which are the dimensions of the underlying image data.
  /// </returns>
  public FaceDetectionResult DetectForVideo(Image image, int timestampMs, Core.ImageProcessingOptions? imageProcessingOptions = null)
  {
    var normalizedRect = ConvertToNormalizedRect(imageProcessingOptions, image, roiAllowed: false);

    var packetMap = new PacketMap();
    // 64-bit multiplication: see _MICRO_SECONDS_PER_MILLISECOND.
    var timestamp = new Timestamp(timestampMs * _MICRO_SECONDS_PER_MILLISECOND);
    packetMap.Emplace(_IMAGE_IN_STREAM_NAME, new ImagePacket(image, timestamp));
    packetMap.Emplace(_NORM_RECT_STREAM_NAME, new NormalizedRectPacket(normalizedRect).At(timestamp));
    var outputPackets = ProcessVideoData(packetMap);

    return GetFaceDetectionResult(outputPackets);
  }

  /// <summary>
  /// Sends live image data (an Image with a unique timestamp) to perform face detection.
  ///
  /// Only use this method when the FaceDetector is created with the live stream
  /// running mode. The input timestamps should be monotonically increasing for
  /// adjacent calls of this method. This method will return immediately after the
  /// input image is accepted. The results will be available via the
  /// <see cref="FaceDetectorOptions.ResultCallback" /> provided in the <see cref="FaceDetectorOptions" />.
  /// The <see cref="DetectAsync" /> method is designed to process live stream data such as camera
  /// input. To lower the overall latency, face detector may drop the input
  /// images if needed. In other words, it's not guaranteed to have output per
  /// input image.
  /// </summary>
  public void DetectAsync(Image image, int timestampMs, Core.ImageProcessingOptions? imageProcessingOptions = null)
  {
    var normalizedRect = ConvertToNormalizedRect(imageProcessingOptions, image, roiAllowed: false);

    var packetMap = new PacketMap();
    // 64-bit multiplication: see _MICRO_SECONDS_PER_MILLISECOND.
    var timestamp = new Timestamp(timestampMs * _MICRO_SECONDS_PER_MILLISECOND);
    packetMap.Emplace(_IMAGE_IN_STREAM_NAME, new ImagePacket(image, timestamp));
    packetMap.Emplace(_NORM_RECT_STREAM_NAME, new NormalizedRectPacket(normalizedRect).At(timestamp));

    SendLiveStreamData(packetMap);
  }

  /// <summary>
  /// Extracts the detections from <paramref name="outputPackets" />, returning an
  /// empty result when the detections packet is empty (no faces found).
  /// Shared by <see cref="Detect" /> and <see cref="DetectForVideo" />.
  /// </summary>
  private static FaceDetectionResult GetFaceDetectionResult(PacketMap outputPackets)
  {
    var outDetectionsPacket = outputPackets.At<DetectionVectorPacket, List<Detection>>(_DETECTIONS_OUT_STREAM_NAME);
    if (outDetectionsPacket.IsEmpty())
    {
      return new FaceDetectionResult(new List<Components.Containers.Detection>());
    }
    return FaceDetectionResult.CreateFrom(outDetectionsPacket.Get());
  }

  /// <summary>
  /// Wraps <paramref name="resultCallback" /> into a <see cref="Tasks.Core.TaskRunner.PacketsCallback" />
  /// that unpacks the output image and detections and forwards them to the user callback.
  /// Returns null when <paramref name="resultCallback" /> is null (non-live-stream modes).
  /// </summary>
  private static Tasks.Core.TaskRunner.PacketsCallback BuildPacketsCallback(FaceDetectorOptions.ResultCallback resultCallback)
  {
    if (resultCallback == null)
    {
      return null;
    }

    return (PacketMap outputPackets) =>
    {
      var outImagePacket = outputPackets.At<ImagePacket, Image>(_IMAGE_OUT_STREAM_NAME);
      var outDetectionsPacket = outputPackets.At<DetectionVectorPacket, List<Detection>>(_DETECTIONS_OUT_STREAM_NAME);
      // Both streams must be present; a missing or empty image packet means
      // there is nothing to report for this iteration.
      if (outImagePacket == null || outDetectionsPacket == null)
      {
        return;
      }
      if (outImagePacket.IsEmpty())
      {
        return;
      }
      var image = outImagePacket.Get();
      // Convert the graph timestamp (microseconds) back to milliseconds for the user callback.
      var timestamp = outImagePacket.Timestamp().Microseconds() / _MICRO_SECONDS_PER_MILLISECOND;
      if (outDetectionsPacket.IsEmpty())
      {
        // No faces detected: still invoke the callback so callers can observe every frame.
        resultCallback(
          new FaceDetectionResult(new List<Components.Containers.Detection>()),
          image,
          (int)timestamp);
        return;
      }
      var detectionProtoList = outDetectionsPacket.Get();
      resultCallback(FaceDetectionResult.CreateFrom(detectionProtoList), image, (int)timestamp);
    };
  }
}
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) 2023 homuler
//
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT.

namespace Mediapipe.Tasks.Vision.FaceDetector
{
/// <summary>
/// Options for the face detector task.
/// </summary>
public sealed class FaceDetectorOptions : Tasks.Core.ITaskOptions
{
  /// <param name="detectionResult">
  /// face detection result object that contains a list of face detections,
  /// each detection has a bounding box that is expressed in the unrotated
  /// input frame of reference coordinates system,
  /// i.e. in `[0,image_width) x [0,image_height)`, which are the dimensions
  /// of the underlying image data.
  /// </param>
  /// <param name="image">
  /// The input image that the face detector runs on.
  /// </param>
  /// <param name="timestampMs">
  /// The input timestamp in milliseconds.
  /// </param>
  public delegate void ResultCallback(Components.Containers.DetectionResult detectionResult, Image image, int timestampMs);

  /// <summary>
  /// Base options for the face detector task.
  /// </summary>
  public Tasks.Core.BaseOptions baseOptions { get; }
  /// <summary>
  /// The running mode of the task. Default to the image mode.
  /// </summary>
  public Core.RunningMode runningMode { get; }
  /// <summary>
  /// The minimum confidence score for the face detection to be considered successful.
  /// </summary>
  // NOTE: the defaults live on the constructor parameters; initializers here
  // would be dead code because the constructor always assigns these properties.
  public float minDetectionConfidence { get; }
  /// <summary>
  /// The minimum non-maximum-suppression threshold for face detection to be considered overlapped.
  /// </summary>
  public float minSuppressionThreshold { get; }
  /// <summary>
  /// The user-defined result callback for processing live stream data.
  /// The result callback should only be specified when the running mode is set to the live stream mode.
  /// </summary>
  public ResultCallback resultCallback { get; }

  public FaceDetectorOptions(
    Tasks.Core.BaseOptions baseOptions,
    Core.RunningMode runningMode = Core.RunningMode.IMAGE,
    float minDetectionConfidence = 0.5f,
    float minSuppressionThreshold = 0.3f,
    ResultCallback resultCallback = null)
  {
    this.baseOptions = baseOptions;
    this.runningMode = runningMode;
    this.minDetectionConfidence = minDetectionConfidence;
    this.minSuppressionThreshold = minSuppressionThreshold;
    this.resultCallback = resultCallback;
  }

  /// <summary>
  /// Converts these options to the corresponding <see cref="Proto.FaceDetectorGraphOptions" />.
  /// Stream mode is enabled for any running mode other than image.
  /// </summary>
  internal Proto.FaceDetectorGraphOptions ToProto()
  {
    var baseOptionsProto = baseOptions.ToProto();
    baseOptionsProto.UseStreamMode = runningMode != Core.RunningMode.IMAGE;

    return new Proto.FaceDetectorGraphOptions
    {
      BaseOptions = baseOptionsProto,
      MinDetectionConfidence = minDetectionConfidence,
      MinSuppressionThreshold = minSuppressionThreshold,
    };
  }

  CalculatorOptions Tasks.Core.ITaskOptions.ToCalculatorOptions()
  {
    // Pack the graph options into a CalculatorOptions via the proto extension mechanism.
    var options = new CalculatorOptions();
    options.SetExtension(Proto.FaceDetectorGraphOptions.Extensions.Ext, ToProto());
    return options;
  }
}
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions mediapipe_api/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ load("@bazel_skylib//lib:selects.bzl", "selects")
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "string_list_flag")
load("@rules_pkg//:pkg.bzl", "pkg_zip")
load("@build_bazel_apple_support//rules:universal_binary.bzl", "universal_binary")
load("@com_google_mediapipe//mediapipe/framework/tool:mediapipe_files.bzl", "mediapipe_files")

bool_flag(
name = "macos_universal",
Expand Down Expand Up @@ -308,6 +309,7 @@ cc_library(
name = "face_detection_calculators",
deps = [
"@com_google_mediapipe//mediapipe/framework/tool:switch_mux_calculator",
"@com_google_mediapipe//mediapipe/tasks/cc/vision/face_detector:face_detector",
] + select({
"@com_google_mediapipe//mediapipe/gpu:disable_gpu": [
"@com_google_mediapipe//mediapipe/graphs/face_detection:desktop_live_calculators",
Expand Down Expand Up @@ -426,6 +428,10 @@ pkg_zip(
],
)

mediapipe_files(srcs = [
"blaze_face_short_range.tflite",
])

pkg_asset(
name = "mediapipe_assets",
srcs = select({
Expand Down Expand Up @@ -465,6 +471,7 @@ pkg_asset(
filegroup(
name = "face_detection_assets",
srcs = [
":blaze_face_short_range.tflite",
"@com_google_mediapipe//mediapipe/modules/face_detection:face_detection_full_range_sparse.tflite",
"@com_google_mediapipe//mediapipe/modules/face_detection:face_detection_full_range.tflite",
"@com_google_mediapipe//mediapipe/modules/face_detection:face_detection_short_range.tflite",
Expand Down Expand Up @@ -557,6 +564,7 @@ pkg_zip(
"//mediapipe_api/modules/face_geometry/protos:proto_srcs",
"//mediapipe_api/modules/holistic_landmark/calculators:proto_srcs",
"//mediapipe_api/tasks/cc/core/proto:proto_srcs",
"//mediapipe_api/tasks/cc/vision/face_detector/proto:proto_srcs",
"//mediapipe_api/util:proto_srcs",
"//mediapipe_api/util/tracking:proto_srcs",
],
Expand Down
Loading

0 comments on commit c777113

Please sign in to comment.