Skip to content

Commit

Permalink
feat(stats): added service statistics mechanism
Browse files Browse the repository at this point in the history
The initial version of the statistics mechanism returns the following metrics:

    inference_count
    predict_success
    predict_failure
    predict_count
    avg_batch_size
    avg_predict_duration
    avg_transform_duration
  • Loading branch information
beniz authored and sileht committed Oct 1, 2020
1 parent 4558ed8 commit 1839e4a
Show file tree
Hide file tree
Showing 17 changed files with 276 additions and 49 deletions.
7 changes: 6 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ if (USE_HDF5)
add_definitions(-DUSE_HDF5)
endif()

set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc svminputfileconn.h svminputfileconn.cc txtinputfileconn.h txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h
mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h
csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc
svminputfileconn.h svminputfileconn.cc txtinputfileconn.h
txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc
service_stats.h service_stats.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
if (USE_JSON_API)
list(APPEND ddetect_SOURCES jsonapi.h jsonapi.cc)
endif()
Expand Down
15 changes: 7 additions & 8 deletions src/backends/caffe/caffelib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2810,6 +2810,7 @@ namespace dd
cad.add("has_mean_file", has_mean_file);
if (ad_output.has("measure"))
{
// FIXME(sileht): Should we create service_stats here ?
try
{
inputc.transform(cad);
Expand Down Expand Up @@ -2842,15 +2843,13 @@ namespace dd
if (ad.has("chain") && ad.get("chain").get<bool>())
cad.add("chain", true);

try
{
inputc.transform(cad);
}
catch (std::exception &e)
{
throw;
}
this->_stats.transform_start();
inputc.transform(cad);
this->_stats.transform_end();

int batch_size = inputc.test_batch_size();
this->_stats.inc_inference_count(batch_size);

if (ad_mllib.has("net"))
{
APIData ad_net = ad_mllib.getobj("net");
Expand Down
5 changes: 5 additions & 0 deletions src/backends/dlib/dliblib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ namespace dd
TOutputConnectorStrategy tout(this->_outputc);
APIData cad = ad;
cad.add("model_repo", this->_mlmodel._repo);

this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -149,9 +151,12 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
int batch_size = inputc.batch_size();
this->_stats.inc_inference_count(batch_size);

if (ad_mllib.has("test_batch_size"))
{
batch_size = ad_mllib.get("test_batch_size").get<int>();
Expand Down
5 changes: 5 additions & 0 deletions src/backends/ncnn/ncnnlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ namespace dd

TInputConnectorStrategy inputc(this->_inputc);
TOutputConnectorStrategy tout(this->_outputc);

this->_stats.transform_start();
try
{
inputc.transform(ad);
Expand All @@ -185,6 +187,9 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

this->_stats.inc_inference_count(inputc._ids.size());

// if height (timestep) changes we need to clear net before recreating an
// extractor with new height, and to reload params and models after clear()
Expand Down
1 change: 1 addition & 0 deletions src/backends/tensorrt/tensorrtinputconns.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ namespace dd
_imgs_size.insert(std::pair<std::string, std::pair<int, int>>(
this->_ids.at(i), this->_images_size.at(i)));
}
_batch_size = this->_images.size();
_batch_index = 0;
}

Expand Down
1 change: 1 addition & 0 deletions src/backends/tensorrt/tensorrtinputconns.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ namespace dd
std::string _meanfname = "mean.binaryproto";
std::string _correspname = "corresp.txt";
int _batch_index = 0;
int _batch_size = 0;
int process_batch(const unsigned int batch_size);
std::unordered_map<std::string, std::pair<int, int>>
_imgs_size; /**< image sizes, used in detection. */
Expand Down
4 changes: 4 additions & 0 deletions src/backends/tensorrt/tensorrtlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ namespace dd
APIData cad = ad;

TOutputConnectorStrategy tout(this->_outputc);
this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -472,6 +473,9 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

this->_stats.inc_inference_count(inputc._batch_size);

int idoffset = 0;
std::vector<APIData> vrad;
Expand Down
4 changes: 4 additions & 0 deletions src/backends/tf/tflib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ namespace dd
TOutputConnectorStrategy tout;
APIData cad = ad;
cad.add("model_repo", this->_mlmodel._repo);

this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -337,9 +339,11 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
int batch_size = inputc.batch_size();
this->_stats.inc_inference_count(batch_size);
if (ad_mllib.has("test_batch_size"))
batch_size = ad_mllib.get("test_batch_size").get<int>();

Expand Down
9 changes: 7 additions & 2 deletions src/backends/torch/torchlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
#include "generators/net_caffe.h"
#include "generators/net_caffe_recurrent.h"

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>

#include "native/native.h"
Expand Down Expand Up @@ -1020,6 +1020,7 @@ namespace dd
if (_module._native != nullptr)
_module._native->update_input_connector(inputc);

this->_stats.transform_start();
TOutputConnectorStrategy outputc(this->_outputc);
try
{
Expand All @@ -1039,6 +1040,8 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

torch::Device cpu("cpu");
_module.eval();

Expand Down Expand Up @@ -1074,6 +1077,8 @@ namespace dd
{
in_vals.push_back(tensor.to(_device));
}
this->_stats.inc_inference_count(in_vals.size());

Tensor output;
try
{
Expand Down Expand Up @@ -1336,4 +1341,4 @@ namespace dd
template class TorchLib<TxtTorchInputFileConn, SupervisedOutput, TorchModel>;
template class TorchLib<CSVTSTorchInputFileConn, SupervisedOutput,
TorchModel>;
}
} // namespace dd
4 changes: 4 additions & 0 deletions src/backends/xgb/xgblib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,8 @@ namespace dd
// data
TInputConnectorStrategy inputc(this->_inputc);
APIData cad = ad;

this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -460,6 +462,7 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

// load existing model as needed
if (!_learner)
Expand Down Expand Up @@ -503,6 +506,7 @@ namespace dd
// results
// float loss = 0.0; // XXX: how to acquire loss ?
int batch_size = preds.Size();
this->_stats.inc_inference_count(batch_size);
int nclasses = _nclasses;
if (_objective == "multi:softprob")
batch_size /= nclasses;
Expand Down
2 changes: 0 additions & 2 deletions src/jsonapi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1575,8 +1575,6 @@ namespace dd
out.toJVal(jpred, jout);
JVal jhead(rapidjson::kObjectType);
jhead.AddMember("method", "/chain", jpred.GetAllocator());
// jhead.AddMember("service",d["service"],jpred.GetAllocator());
// if (!has_measure)
jhead.AddMember("time", jout["time"], jpred.GetAllocator());
jpred.AddMember("head", jhead, jpred.GetAllocator());
JVal jbody(rapidjson::kObjectType);
Expand Down
19 changes: 9 additions & 10 deletions src/mllibstrategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define MLLIBSTRATEGY_H

#include "apidata.h"
#include "service_stats.h"
#include "utils/fileops.hpp"
#include "dd_spdlog.h"
#include <atomic>
Expand Down Expand Up @@ -94,7 +95,7 @@ namespace dd
MLLib(MLLib &&mll) noexcept
: _inputc(mll._inputc), _outputc(mll._outputc), _mltype(mll._mltype),
_mlmodel(mll._mlmodel), _meas(mll._meas),
_meas_per_iter(mll._meas_per_iter),
_meas_per_iter(mll._meas_per_iter), _stats(mll._stats),
_tjob_running(mll._tjob_running.load()), _logger(mll._logger)
{
}
Expand Down Expand Up @@ -298,8 +299,9 @@ namespace dd
int c = 0;
for (double l : vl)
{
std::string measl = meas + '_' + cnames.at(c); // std::to_string(c);
auto hit = _meas.find(measl);
std::string measl = meas + '_' + cnames.at(c);
auto hit = _meas.find(
measl); // not reusing add_meas since need a global lock
if (hit != _meas.end())
(*hit).second = l;
else
Expand All @@ -309,7 +311,7 @@ namespace dd
}

/**
* \brief get currentvalue of argument measure
* \brief get current value of argument measure
* @param meas measure name
* @return current value of measure
*/
Expand Down Expand Up @@ -379,15 +381,12 @@ namespace dd
std::unordered_map<std::string, std::vector<double>>
_meas_per_iter; /**< model measures per iteration. */

ServiceStats _stats; /**< service statistics/metrics .*/

std::atomic<bool> _tjob_running = {
false
}; /**< whether a training job is running with this lib instance. */

bool _online
= false; /**< whether the algorithm is online, i.e. it interleaves
training and prediction calls. When not, prediction calls
are rejected while training is running. */

std::shared_ptr<spdlog::logger> _logger; /**< mllib logger. */

long int _model_flops = 0; /**< model flops. */
Expand All @@ -398,7 +397,7 @@ namespace dd
protected:
mutable std::mutex
_meas_per_iter_mutex; /**< mutex over measures history. */
mutable std::mutex _meas_mutex; /** mutex around current measures. */
mutable std::mutex _meas_mutex; /**< mutex around current measures. */
const int _max_meas_points = 1e7; // 10M points max per measure
};

Expand Down
49 changes: 23 additions & 26 deletions src/mlservice.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ namespace dd
ad.add("height", this->_inputc.height());
}
}
this->_stats.to(ad);
return ad;
}

Expand Down Expand Up @@ -283,7 +284,9 @@ namespace dd
static_cast<long int>(this->_mem_used_train * sizeof(float)));
stats.add("data_mem_test",
static_cast<long int>(this->_mem_used_test * sizeof(float)));
ad.add("stats", stats);
ad.add("stats", stats); // FIXME(sileht): deprecated name, delete me when
// platform use the new name
ad.add("model_stats", stats);
ad.add("jobs", vad);
ad.add("parameters", _init_parameters);
ad.add("repository", this->_inputc._model_repo);
Expand All @@ -292,6 +295,7 @@ namespace dd
ad.add("type", std::string("unsupervised"));
else
ad.add("type", std::string("supervised"));
this->_stats.to(ad);
return ad;
}

Expand Down Expand Up @@ -495,36 +499,29 @@ namespace dd
*/
int predict_job(const APIData &ad, APIData &out, const bool &chain = false)
{
// TODO: collect input transformed data for chain, store it here in
// memory
// -> beware, the input connector is a copy...
if (!_train_mutex.try_lock_shared())
throw MLServiceLockException(
"Predict call while training with an offline learning algorithm");

if (!this->_online)
this->_stats.predict_start();

int err = 0;
try
{
if (!_train_mutex.try_lock_shared())
throw MLServiceLockException("Predict call while training with an "
"offline learning algorithm");
int err = 0;
try
{
if (chain)
const_cast<APIData &>(ad).add("chain", true);
err = this->predict(ad, out);
}
catch (std::exception &e)
{
_train_mutex.unlock_shared();
throw;
}
_train_mutex.unlock_shared();
return err;
if (chain)
const_cast<APIData &>(ad).add("chain", true);
err = this->predict(ad, out);
}
else // wait til a lock can be acquired
catch (std::exception &e)
{
boost::shared_lock<boost::shared_mutex> lock(_train_mutex);
return this->predict(ad, out);
_train_mutex.unlock_shared();
this->_stats.predict_end(false);
throw;
}
return 0;
this->_stats.predict_end(true);

_train_mutex.unlock_shared();
return err;
}

std::string _sname; /**< service name. */
Expand Down
Loading

0 comments on commit 1839e4a

Please sign in to comment.