Skip to content

Commit

Permalink
feat(stats): added service statistics mechanism
Browse files Browse the repository at this point in the history
The initial version of the statistics mechanism returns the following metrics:

    inference_count
    predict_success
    predict_failure
    predict_count
    avg_batch_size
    avg_predict_duration
    avg_transform_duration
  • Loading branch information
beniz authored and sileht committed Oct 1, 2020
1 parent 4558ed8 commit 1839e4a
Show file tree
Hide file tree
Showing 17 changed files with 276 additions and 49 deletions.
7 changes: 6 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ if (USE_HDF5)
add_definitions(-DUSE_HDF5)
endif()

set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc svminputfileconn.h svminputfileconn.cc txtinputfileconn.h txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h
mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h
csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc
svminputfileconn.h svminputfileconn.cc txtinputfileconn.h
txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc
service_stats.h service_stats.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
if (USE_JSON_API)
list(APPEND ddetect_SOURCES jsonapi.h jsonapi.cc)
endif()
Expand Down
15 changes: 7 additions & 8 deletions src/backends/caffe/caffelib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2810,6 +2810,7 @@ namespace dd
cad.add("has_mean_file", has_mean_file);
if (ad_output.has("measure"))
{
// FIXME(sileht): Should we create service_stats here ?
try
{
inputc.transform(cad);
Expand Down Expand Up @@ -2842,15 +2843,13 @@ namespace dd
if (ad.has("chain") && ad.get("chain").get<bool>())
cad.add("chain", true);

try
{
inputc.transform(cad);
}
catch (std::exception &e)
{
throw;
}
this->_stats.transform_start();
inputc.transform(cad);
this->_stats.transform_end();

int batch_size = inputc.test_batch_size();
this->_stats.inc_inference_count(batch_size);

if (ad_mllib.has("net"))
{
APIData ad_net = ad_mllib.getobj("net");
Expand Down
5 changes: 5 additions & 0 deletions src/backends/dlib/dliblib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ namespace dd
TOutputConnectorStrategy tout(this->_outputc);
APIData cad = ad;
cad.add("model_repo", this->_mlmodel._repo);

this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -149,9 +151,12 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
int batch_size = inputc.batch_size();
this->_stats.inc_inference_count(batch_size);

if (ad_mllib.has("test_batch_size"))
{
batch_size = ad_mllib.get("test_batch_size").get<int>();
Expand Down
5 changes: 5 additions & 0 deletions src/backends/ncnn/ncnnlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ namespace dd

TInputConnectorStrategy inputc(this->_inputc);
TOutputConnectorStrategy tout(this->_outputc);

this->_stats.transform_start();
try
{
inputc.transform(ad);
Expand All @@ -185,6 +187,9 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

this->_stats.inc_inference_count(inputc._ids.size());

// if height (timestep) changes we need to clear net before recreating an
// extractor with new height, and to reload params and models after clear()
Expand Down
1 change: 1 addition & 0 deletions src/backends/tensorrt/tensorrtinputconns.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ namespace dd
_imgs_size.insert(std::pair<std::string, std::pair<int, int>>(
this->_ids.at(i), this->_images_size.at(i)));
}
_batch_size = this->_images.size();
_batch_index = 0;
}

Expand Down
1 change: 1 addition & 0 deletions src/backends/tensorrt/tensorrtinputconns.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ namespace dd
std::string _meanfname = "mean.binaryproto";
std::string _correspname = "corresp.txt";
int _batch_index = 0;
int _batch_size = 0;
int process_batch(const unsigned int batch_size);
std::unordered_map<std::string, std::pair<int, int>>
_imgs_size; /**< image sizes, used in detection. */
Expand Down
4 changes: 4 additions & 0 deletions src/backends/tensorrt/tensorrtlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ namespace dd
APIData cad = ad;

TOutputConnectorStrategy tout(this->_outputc);
this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -472,6 +473,9 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

this->_stats.inc_inference_count(inputc._batch_size);

int idoffset = 0;
std::vector<APIData> vrad;
Expand Down
4 changes: 4 additions & 0 deletions src/backends/tf/tflib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ namespace dd
TOutputConnectorStrategy tout;
APIData cad = ad;
cad.add("model_repo", this->_mlmodel._repo);

this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -337,9 +339,11 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
int batch_size = inputc.batch_size();
this->_stats.inc_inference_count(batch_size);
if (ad_mllib.has("test_batch_size"))
batch_size = ad_mllib.get("test_batch_size").get<int>();

Expand Down
9 changes: 7 additions & 2 deletions src/backends/torch/torchlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
#include "generators/net_caffe.h"
#include "generators/net_caffe_recurrent.h"

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>

#include "native/native.h"
Expand Down Expand Up @@ -1020,6 +1020,7 @@ namespace dd
if (_module._native != nullptr)
_module._native->update_input_connector(inputc);

this->_stats.transform_start();
TOutputConnectorStrategy outputc(this->_outputc);
try
{
Expand All @@ -1039,6 +1040,8 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

torch::Device cpu("cpu");
_module.eval();

Expand Down Expand Up @@ -1074,6 +1077,8 @@ namespace dd
{
in_vals.push_back(tensor.to(_device));
}
this->_stats.inc_inference_count(in_vals.size());

Tensor output;
try
{
Expand Down Expand Up @@ -1336,4 +1341,4 @@ namespace dd
template class TorchLib<TxtTorchInputFileConn, SupervisedOutput, TorchModel>;
template class TorchLib<CSVTSTorchInputFileConn, SupervisedOutput,
TorchModel>;
}
} // namespace dd
4 changes: 4 additions & 0 deletions src/backends/xgb/xgblib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,8 @@ namespace dd
// data
TInputConnectorStrategy inputc(this->_inputc);
APIData cad = ad;

this->_stats.transform_start();
try
{
inputc.transform(cad);
Expand All @@ -460,6 +462,7 @@ namespace dd
{
throw;
}
this->_stats.transform_end();

// load existing model as needed
if (!_learner)
Expand Down Expand Up @@ -503,6 +506,7 @@ namespace dd
// results
// float loss = 0.0; // XXX: how to acquire loss ?
int batch_size = preds.Size();
this->_stats.inc_inference_count(batch_size);
int nclasses = _nclasses;
if (_objective == "multi:softprob")
batch_size /= nclasses;
Expand Down
2 changes: 0 additions & 2 deletions src/jsonapi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1575,8 +1575,6 @@ namespace dd
out.toJVal(jpred, jout);
JVal jhead(rapidjson::kObjectType);
jhead.AddMember("method", "/chain", jpred.GetAllocator());
// jhead.AddMember("service",d["service"],jpred.GetAllocator());
// if (!has_measure)
jhead.AddMember("time", jout["time"], jpred.GetAllocator());
jpred.AddMember("head", jhead, jpred.GetAllocator());
JVal jbody(rapidjson::kObjectType);
Expand Down
19 changes: 9 additions & 10 deletions src/mllibstrategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define MLLIBSTRATEGY_H

#include "apidata.h"
#include "service_stats.h"
#include "utils/fileops.hpp"
#include "dd_spdlog.h"
#include <atomic>
Expand Down Expand Up @@ -94,7 +95,7 @@ namespace dd
MLLib(MLLib &&mll) noexcept
: _inputc(mll._inputc), _outputc(mll._outputc), _mltype(mll._mltype),
_mlmodel(mll._mlmodel), _meas(mll._meas),
_meas_per_iter(mll._meas_per_iter),
_meas_per_iter(mll._meas_per_iter), _stats(mll._stats),
_tjob_running(mll._tjob_running.load()), _logger(mll._logger)
{
}
Expand Down Expand Up @@ -298,8 +299,9 @@ namespace dd
int c = 0;
for (double l : vl)
{
std::string measl = meas + '_' + cnames.at(c); // std::to_string(c);
auto hit = _meas.find(measl);
std::string measl = meas + '_' + cnames.at(c);
auto hit = _meas.find(
measl); // not reusing add_meas since need a global lock
if (hit != _meas.end())
(*hit).second = l;
else
Expand All @@ -309,7 +311,7 @@ namespace dd
}

/**
* \brief get currentvalue of argument measure
* \brief get current value of argument measure
* @param meas measure name
* @return current value of measure
*/
Expand Down Expand Up @@ -379,15 +381,12 @@ namespace dd
std::unordered_map<std::string, std::vector<double>>
_meas_per_iter; /**< model measures per iteration. */

ServiceStats _stats; /**< service statistics/metrics .*/

std::atomic<bool> _tjob_running = {
false
}; /**< whether a training job is running with this lib instance. */

bool _online
= false; /**< whether the algorithm is online, i.e. it interleaves
training and prediction calls. When not, prediction calls
are rejected while training is running. */

std::shared_ptr<spdlog::logger> _logger; /**< mllib logger. */

long int _model_flops = 0; /**< model flops. */
Expand All @@ -398,7 +397,7 @@ namespace dd
protected:
mutable std::mutex
_meas_per_iter_mutex; /**< mutex over measures history. */
mutable std::mutex _meas_mutex; /** mutex around current measures. */
mutable std::mutex _meas_mutex; /**< mutex around current measures. */
const int _max_meas_points = 1e7; // 10M points max per measure
};

Expand Down
49 changes: 23 additions & 26 deletions src/mlservice.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ namespace dd
ad.add("height", this->_inputc.height());
}
}
this->_stats.to(ad);
return ad;
}

Expand Down Expand Up @@ -283,7 +284,9 @@ namespace dd
static_cast<long int>(this->_mem_used_train * sizeof(float)));
stats.add("data_mem_test",
static_cast<long int>(this->_mem_used_test * sizeof(float)));
ad.add("stats", stats);
ad.add("stats", stats); // FIXME(sileht): deprecated name, delete me when
// platform use the new name
ad.add("model_stats", stats);
ad.add("jobs", vad);
ad.add("parameters", _init_parameters);
ad.add("repository", this->_inputc._model_repo);
Expand All @@ -292,6 +295,7 @@ namespace dd
ad.add("type", std::string("unsupervised"));
else
ad.add("type", std::string("supervised"));
this->_stats.to(ad);
return ad;
}

Expand Down Expand Up @@ -495,36 +499,29 @@ namespace dd
*/
int predict_job(const APIData &ad, APIData &out, const bool &chain = false)
{
// TODO: collect input transformed data for chain, store it here in
// memory
// -> beware, the input connector is a copy...
if (!_train_mutex.try_lock_shared())
throw MLServiceLockException(
"Predict call while training with an offline learning algorithm");

if (!this->_online)
this->_stats.predict_start();

int err = 0;
try
{
if (!_train_mutex.try_lock_shared())
throw MLServiceLockException("Predict call while training with an "
"offline learning algorithm");
int err = 0;
try
{
if (chain)
const_cast<APIData &>(ad).add("chain", true);
err = this->predict(ad, out);
}
catch (std::exception &e)
{
_train_mutex.unlock_shared();
throw;
}
_train_mutex.unlock_shared();
return err;
if (chain)
const_cast<APIData &>(ad).add("chain", true);
err = this->predict(ad, out);
}
else // wait til a lock can be acquired
catch (std::exception &e)
{
boost::shared_lock<boost::shared_mutex> lock(_train_mutex);
return this->predict(ad, out);
_train_mutex.unlock_shared();
this->_stats.predict_end(false);
throw;
}
return 0;
this->_stats.predict_end(true);

_train_mutex.unlock_shared();
return err;
}

std::string _sname; /**< service name. */
Expand Down
Loading

0 comments on commit 1839e4a

Please sign in to comment.