Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better rvalues support for parallel_reduce algorithm #1307

Merged
merged 10 commits into from
Feb 14, 2024
49 changes: 25 additions & 24 deletions include/oneapi/tbb/parallel_reduce.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2005-2023 Intel Corporation
Copyright (c) 2005-2024 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,16 +42,16 @@ concept parallel_reduce_body = splittable<Body> &&

template <typename Function, typename Range, typename Value>
concept parallel_reduce_function = std::invocable<const std::remove_reference_t<Function>&,
const Range&, const Value&> &&
const Range&, Value&&> &&
std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Function>&,
const Range&, const Value&>,
const Range&, Value&&>,
Value>;

template <typename Combine, typename Value>
concept parallel_reduce_combine = std::invocable<const std::remove_reference_t<Combine>&,
const Value&, const Value&> &&
Value&&, Value&&> &&
std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Combine>&,
const Value&, const Value&>,
Value&&, Value&&>,
Value>;

} // namespace d0
Expand Down Expand Up @@ -390,14 +390,15 @@ class lambda_reduce_body {
, my_value(other.my_identity_element)
{ }
void operator()(Range& range) {
my_value = tbb::detail::invoke(my_real_body, range, const_cast<const Value&>(my_value));
my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value));
}

void join( lambda_reduce_body& rhs ) {
my_value = tbb::detail::invoke(my_reduction, const_cast<const Value&>(my_value),
const_cast<const Value&>(rhs.my_value));
my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value));
}
Value result() const {
return my_value;

__TBB_nodiscard Value&& result() && noexcept {
return std::move(my_value);
}
};

Expand Down Expand Up @@ -514,7 +515,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run(range, body, __TBB_DEFAULT_PARTITIONER() );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and simple_partitioner.
Expand All @@ -527,7 +528,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run(range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and auto_partitioner
Expand All @@ -540,7 +541,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and static_partitioner
Expand All @@ -553,7 +554,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and affinity_partitioner
Expand All @@ -566,7 +567,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, default partitioner and user-supplied context.
Expand All @@ -579,7 +580,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, simple partitioner and user-supplied context.
Expand All @@ -592,7 +593,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, auto_partitioner and user-supplied context
Expand All @@ -605,7 +606,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, static_partitioner and user-supplied context
Expand All @@ -618,7 +619,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, affinity_partitioner and user-supplied context
Expand All @@ -631,7 +632,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction and default simple partitioner.
Expand Down Expand Up @@ -704,7 +705,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
::run(range, body, partitioner);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction and static partitioner.
Expand All @@ -716,7 +717,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
Expand All @@ -739,7 +740,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
::run(range, body, partitioner, context);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
Expand All @@ -752,7 +753,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner, context);
return body.result();
return std::move(body).result();
}
//@}

Expand Down
162 changes: 161 additions & 1 deletion test/conformance/conformance_parallel_reduce.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2005-2023 Intel Corporation
Copyright (c) 2005-2024 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include "common/test_invoke.h"

#include "../tbb/test_partitioner.h"
#include <list>

//! \file conformance_parallel_reduce.cpp
//! \brief Test for [algorithms.parallel_reduce algorithms.parallel_deterministic_reduce] specification
Expand Down Expand Up @@ -56,6 +57,59 @@ struct ReduceBody {
}
};

template <typename T>
class MoveOnlyWrapper {
public:
MoveOnlyWrapper() = default;
MoveOnlyWrapper(const T& obj) : my_obj(obj) {}

MoveOnlyWrapper(MoveOnlyWrapper&&) = default;
MoveOnlyWrapper& operator=(MoveOnlyWrapper&&) = default;

MoveOnlyWrapper(const MoveOnlyWrapper&) = delete;
MoveOnlyWrapper& operator=(const MoveOnlyWrapper&) = delete;

bool operator==(const MoveOnlyWrapper& other) const { return my_obj == other.my_obj; }
private:
T my_obj;
}; // class MoveOnlyWrapper

// The container wrapper that is copyable but the copy constructor fails if the source container is non-empty
// If such an empty container is provided as an identity into parallel reduce algorithm with rvalue-friendly body,
// it should only call the copy constructor while broadcasting the identity element into the leafs
// and the identity element is an empty container for the further test
template <typename T>
class EmptyCopyList {
public:
EmptyCopyList() = default;

EmptyCopyList(EmptyCopyList&&) = default;
EmptyCopyList& operator=(EmptyCopyList&&) = default;

EmptyCopyList(const EmptyCopyList& other) {
REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
}
EmptyCopyList& operator=(const EmptyCopyList& other) {
REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
return *this;
}

typename std::list<T>::iterator insert(typename std::list<T>::const_iterator pos, T&& item) {
return my_list.insert(pos, std::move(item));
}

void splice(typename std::list<T>::const_iterator pos, EmptyCopyList&& other) {
my_list.splice(pos, std::move(other.my_list));
}

typename std::list<T>::const_iterator end() const { return my_list.end(); }

bool operator==(const EmptyCopyList& other) const { return my_list == other.my_list; }

private:
std::list<T> my_list;
}; // class EmptyCopyList

template <class Partitioner>
void TestDeterministicReductionFor() {
const int N = 1000;
Expand Down Expand Up @@ -174,3 +228,109 @@ TEST_CASE("parallel_[deterministic_]reduce and std::invoke") {
}

#endif

template <typename Runner, typename... PartitionerContext>
void test_vector_of_lists_rvalue_reduce_basic(const Runner& runner, PartitionerContext&&... args) {
constexpr std::size_t n_vectors = 10000;

using inner_type = MoveOnlyWrapper<int>;
using list_type = EmptyCopyList<inner_type>;
using vector_of_lists_type = std::vector<list_type>;

vector_of_lists_type vector_of_lists;

vector_of_lists.reserve(n_vectors);
for (std::size_t i = 0; i < n_vectors; ++i) {
list_type list;

list.insert(list.end(), inner_type{1});
list.insert(list.end(), inner_type{2});
list.insert(list.end(), inner_type{3});
list.insert(list.end(), inner_type{4});
list.insert(list.end(), inner_type{5});
vector_of_lists.emplace_back(std::move(list));
}

oneapi::tbb::blocked_range<std::size_t> range(0, n_vectors, n_vectors * 2);

auto reduce_body = [&](const decltype(range)& range_obj, list_type&& x) {
list_type new_list = std::move(x);

for (std::size_t index = range_obj.begin(); index != range_obj.end(); ++index) {
new_list.splice(new_list.end(), std::move(vector_of_lists[index]));
}
return new_list;
};

auto join_body = [&](list_type&& x, list_type&& y) {
list_type new_list = std::move(x);

new_list.splice(new_list.end(), std::move(y));
return new_list;
};

list_type result = runner(range, list_type{}, reduce_body, join_body, std::forward<PartitionerContext>(args)...);

list_type expected_result;

for (std::size_t i = 0; i < n_vectors; ++i) {
expected_result.insert(expected_result.end(), inner_type{1});
expected_result.insert(expected_result.end(), inner_type{2});
expected_result.insert(expected_result.end(), inner_type{3});
expected_result.insert(expected_result.end(), inner_type{4});
expected_result.insert(expected_result.end(), inner_type{5});
}

REQUIRE_MESSAGE(expected_result == result, "Incorrect reduce result");
}

struct ReduceRunner {
template <typename... Args>
auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_reduce(std::forward<Args>(args)...)) {
return oneapi::tbb::parallel_reduce(std::forward<Args>(args)...);
}
};

struct DeterministicReduceRunner {
template <typename... Args>
auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...)) {
return oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...);
}
Comment on lines +296 to +298

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the trailing return type helpful versus just leave it to auto (or decltype(auto))?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are forced to use trailing return type, because both auto with automatic deduction and decltype(auto) are C++14 features and we still need to support C++11.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. That's annoying.

};

void test_vector_of_lists_rvalue_reduce() {
ReduceRunner runner;
oneapi::tbb::affinity_partitioner af_partitioner;
oneapi::tbb::task_group_context context;

test_vector_of_lists_rvalue_reduce_basic(runner);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner);

test_vector_of_lists_rvalue_reduce_basic(runner, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner, context);
}

void test_vector_of_lists_rvalue_deterministic_reduce() {
DeterministicReduceRunner runner;
oneapi::tbb::task_group_context context;

test_vector_of_lists_rvalue_reduce_basic(runner);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});

test_vector_of_lists_rvalue_reduce_basic(runner, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
}

//! \brief \ref interface \ref requirement
TEST_CASE("test rvalue optimization") {
test_vector_of_lists_rvalue_reduce();
test_vector_of_lists_rvalue_deterministic_reduce();
}
Loading