Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better rvalues support for parallel_reduce algorithm #1307

Merged
merged 10 commits into from
Feb 14, 2024
49 changes: 25 additions & 24 deletions include/oneapi/tbb/parallel_reduce.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2005-2023 Intel Corporation
Copyright (c) 2005-2024 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,16 +42,16 @@ concept parallel_reduce_body = splittable<Body> &&

template <typename Function, typename Range, typename Value>
concept parallel_reduce_function = std::invocable<const std::remove_reference_t<Function>&,
const Range&, const Value&> &&
const Range&, Value&&> &&
std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Function>&,
const Range&, const Value&>,
const Range&, Value&&>,
Value>;

template <typename Combine, typename Value>
concept parallel_reduce_combine = std::invocable<const std::remove_reference_t<Combine>&,
const Value&, const Value&> &&
Value&&, Value&&> &&
std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Combine>&,
const Value&, const Value&>,
Value&&, Value&&>,
Value>;

} // namespace d0
Expand Down Expand Up @@ -390,14 +390,15 @@ class lambda_reduce_body {
, my_value(other.my_identity_element)
{ }
void operator()(Range& range) {
my_value = tbb::detail::invoke(my_real_body, range, const_cast<const Value&>(my_value));
my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value));
}

void join( lambda_reduce_body& rhs ) {
my_value = tbb::detail::invoke(my_reduction, const_cast<const Value&>(my_value),
const_cast<const Value&>(rhs.my_value));
my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value));
}
Value result() const {
return my_value;

__TBB_nodiscard Value&& result() && noexcept {
return std::move(my_value);
}
};

Expand Down Expand Up @@ -514,7 +515,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run(range, body, __TBB_DEFAULT_PARTITIONER() );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and simple_partitioner.
Expand All @@ -527,7 +528,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run(range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and auto_partitioner
Expand All @@ -540,7 +541,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and static_partitioner
Expand All @@ -553,7 +554,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and affinity_partitioner
Expand All @@ -566,7 +567,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, default partitioner and user-supplied context.
Expand All @@ -579,7 +580,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, simple partitioner and user-supplied context.
Expand All @@ -592,7 +593,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, auto_partitioner and user-supplied context
Expand All @@ -605,7 +606,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, static_partitioner and user-supplied context
Expand All @@ -618,7 +619,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, affinity_partitioner and user-supplied context
Expand All @@ -631,7 +632,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction and default simple partitioner.
Expand Down Expand Up @@ -704,7 +705,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
::run(range, body, partitioner);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction and static partitioner.
Expand All @@ -716,7 +717,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
Expand All @@ -739,7 +740,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
::run(range, body, partitioner, context);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
Expand All @@ -752,7 +753,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner, context);
return body.result();
return std::move(body).result();
}
//@}

Expand Down
169 changes: 168 additions & 1 deletion test/conformance/conformance_parallel_reduce.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2005-2023 Intel Corporation
Copyright (c) 2005-2024 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include "common/test_invoke.h"

#include "../tbb/test_partitioner.h"
#include <list>

//! \file conformance_parallel_reduce.cpp
//! \brief Test for [algorithms.parallel_reduce algorithms.parallel_deterministic_reduce] specification
Expand Down Expand Up @@ -56,6 +57,67 @@ struct ReduceBody {
}
};

// A type that is formally copyable (so that standard containers can store it),
// but whose copy constructor and copy assignment are expected never to be invoked.
template <typename T>
class NeverCopyWrapper {
public:
NeverCopyWrapper() = default;
NeverCopyWrapper(const T& obj) : my_obj(obj) {}

NeverCopyWrapper(NeverCopyWrapper&&) = default;
NeverCopyWrapper& operator=(NeverCopyWrapper&&) = default;

NeverCopyWrapper(const NeverCopyWrapper&) {
REQUIRE_MESSAGE(false, "Copy constructor of NeverCopyWrapper should never be called");
}

NeverCopyWrapper& operator=(const NeverCopyWrapper&) {
REQUIRE_MESSAGE(false, "Copy assignment of NeverCopyWrapper should never be called");
return *this;
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to =delete; these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot delete them because if we provide std::vector<NeverCopyWrapper> as an identity to parallel_reduce, it must be copied into each reduce leaf. Even though only the empty identity is ever copied, std::vector still requires its value_type to be copy-constructible, so deleting the copy operations causes compilation errors.
Because of this, I decided to keep the type formally copyable to satisfy the compiler, while making any actual call to the copy constructor or copy assignment fail the test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have investigated one more time, and since we don't use std::vector directly, only using the wrapper, we can control that actual copy constructor would not be called by the container. So I removed NeverCopyWrapper and changed it to be MoveOnlyWrapper


bool operator==(const NeverCopyWrapper& other) const { return my_obj == other.my_obj; }
private:
T my_obj;
}; // class NeverCopyWrapper

// A container wrapper that is copyable, but whose copy operations fail the test if the source is non-empty.
// When an empty instance is provided as the identity to a parallel reduce algorithm with an rvalue-friendly body,
// the only copies made should be those broadcasting the identity element into the leaves;
// the identity element used in the tests below is an empty container.
template <typename T>
class EmptyCopyList {
    using list_t = std::list<T>;

    list_t my_list;  // wrapped payload; may only be copied while empty

public:
    EmptyCopyList() = default;

    // Moving is the intended way to transfer contents between instances.
    EmptyCopyList(EmptyCopyList&&) = default;
    EmptyCopyList& operator=(EmptyCopyList&&) = default;

    // Copying is syntactically allowed (the reduce algorithm broadcasts the
    // identity by copy), but copying a non-empty instance fails the test.
    EmptyCopyList(const EmptyCopyList& other) {
        REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
    }
    EmptyCopyList& operator=(const EmptyCopyList& other) {
        REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
        return *this;
    }

    // Inserts a single element before pos, taking it by rvalue reference.
    typename list_t::iterator insert(typename list_t::const_iterator pos, T&& item) {
        return my_list.insert(pos, std::move(item));
    }

    // Steals all elements from other and places them before pos (no copies).
    void splice(typename list_t::const_iterator pos, EmptyCopyList&& other) {
        my_list.splice(pos, std::move(other.my_list));
    }

    typename list_t::const_iterator end() const { return my_list.end(); }

    bool operator==(const EmptyCopyList& other) const { return my_list == other.my_list; }
}; // class EmptyCopyList

template <class Partitioner>
void TestDeterministicReductionFor() {
const int N = 1000;
Expand Down Expand Up @@ -174,3 +236,108 @@ TEST_CASE("parallel_[deterministic_]reduce and std::invoke") {
}

#endif

template <typename Runner, typename... PartitionerContext>
void test_vector_of_lists_rvalue_reduce_basic(const Runner& runner, PartitionerContext&&... args) {
constexpr std::size_t n_vectors = 10'000;

using inner_type = NeverCopyWrapper<int>;
using list_type = EmptyCopyList<inner_type>;
using vector_of_lists_type = std::vector<list_type>;

vector_of_lists_type vector_of_lists;

vector_of_lists.reserve(n_vectors);
for (std::size_t i = 0; i < n_vectors; ++i) {
list_type list;

list.insert(list.end(), inner_type{1});
list.insert(list.end(), inner_type{2});
list.insert(list.end(), inner_type{3});
list.insert(list.end(), inner_type{4});
list.insert(list.end(), inner_type{5});
vector_of_lists.emplace_back(std::move(list));
}

oneapi::tbb::blocked_range<std::size_t> range(0, n_vectors, n_vectors * 2);

auto reduce_body = [&](const decltype(range)& range_obj, list_type&& x) {
list_type new_list = std::move(x);

for (std::size_t index = range_obj.begin(); index != range_obj.end(); ++index) {
new_list.splice(new_list.end(), std::move(vector_of_lists[index]));
}
return new_list;
};

auto join_body = [&](list_type&& x, list_type&& y) {
list_type new_list = std::move(x);

new_list.splice(new_list.end(), std::move(y));
return new_list;
};

list_type result = runner(range, list_type{}, reduce_body, join_body, std::forward<PartitionerContext>(args)...);

list_type expected_result;

for (std::size_t i = 0; i < n_vectors; ++i) {
expected_result.insert(expected_result.end(), inner_type{1});
expected_result.insert(expected_result.end(), inner_type{2});
expected_result.insert(expected_result.end(), inner_type{3});
expected_result.insert(expected_result.end(), inner_type{4});
expected_result.insert(expected_result.end(), inner_type{5});
}

REQUIRE_MESSAGE(expected_result == result, "Incorrect reduce result");
}

// Callable adaptor that dispatches to oneapi::tbb::parallel_reduce, perfectly
// forwarding every argument. A trailing return type is used because return-type
// deduction (plain auto / decltype(auto)) requires C++14 and the suite still
// supports C++11.
struct ReduceRunner {
    template <typename... ArgTypes>
    auto operator()(ArgTypes&&... call_args) const
        -> decltype(oneapi::tbb::parallel_reduce(std::forward<ArgTypes>(call_args)...)) {
        return oneapi::tbb::parallel_reduce(std::forward<ArgTypes>(call_args)...);
    }
};

struct DeterministicReduceRunner {
template <typename... Args>
auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...)) {
return oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...);
}
Comment on lines +296 to +298

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the trailing return type helpful versus just leave it to auto (or decltype(auto))?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are forced to use trailing return type, because both auto with automatic deduction and decltype(auto) are C++14 features and we still need to support C++11.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. That's annoying.

};

void test_vector_of_lists_rvalue_reduce() {
ReduceRunner runner;
oneapi::tbb::affinity_partitioner af_partitioner;
oneapi::tbb::task_group_context context;

test_vector_of_lists_rvalue_reduce_basic(runner);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner);

test_vector_of_lists_rvalue_reduce_basic(runner, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner, context);
}

void test_vector_of_lists_rvalue_deterministic_reduce() {
DeterministicReduceRunner runner;
oneapi::tbb::task_group_context context;

test_vector_of_lists_rvalue_reduce_basic(runner);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});

test_vector_of_lists_rvalue_reduce_basic(runner, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
}

//! Testing that parallel_reduce and parallel_deterministic_reduce move
//! (rather than copy) partial results when given rvalue-friendly bodies
TEST_CASE("test rvalue optimization") {
test_vector_of_lists_rvalue_reduce();
test_vector_of_lists_rvalue_deterministic_reduce();
}
Loading