Better rvalues support for parallel_reduce algorithm #1307

Merged 10 commits on Feb 14, 2024
45 changes: 23 additions & 22 deletions include/oneapi/tbb/parallel_reduce.h
@@ -35,9 +35,9 @@ inline namespace d0 {

template <typename Body, typename Range>
concept parallel_reduce_body = splittable<Body> &&
- requires( Body& body, const Range& range, Body& rhs ) {
+ requires( Body& body, const Range& range, Body&& rhs ) {
body(range);
- body.join(rhs);
+ body.join(std::move(rhs));

Review comment:

This is taken by forwarding reference, so it should be a forward, not a move.

Contributor Author:

Since this part of the reduce API was not changed at all, changing this concept was a mistake; the line has been removed. The parallel_reduce_reduction and parallel_reduce_combine concepts were changed instead.
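For orientation, here is a rough sketch of what rvalue-aware versions of those two concepts could look like. The names parallel_reduce_reduction and parallel_reduce_combine come from the comment above; the parameter lists and return-type constraints below are an assumption for illustration, not the wording that was merged.

#include <concepts>
#include <utility>

// Hedged sketch only: not the oneTBB definitions, just the general shape.
template <typename Reduction, typename Range, typename Value>
concept parallel_reduce_reduction =
    requires( const Reduction& reduction, const Range& range, Value&& value ) {
        // The leaf reduction may consume the running value as an rvalue
        // and produce the next value.
        { reduction(range, std::move(value)) } -> std::convertible_to<Value>;
    };

template <typename Combine, typename Value>
concept parallel_reduce_combine =
    requires( const Combine& combine, Value&& lhs, Value&& rhs ) {
        // The combiner may take both partial results as rvalues.
        { combine(std::move(lhs), std::move(rhs)) } -> std::convertible_to<Value>;
    };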

};

template <typename Function, typename Range, typename Value>
@@ -390,14 +390,15 @@ class lambda_reduce_body {
, my_value(other.my_identity_element)
{ }
void operator()(Range& range) {
- my_value = tbb::detail::invoke(my_real_body, range, const_cast<const Value&>(my_value));
+ my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value));
}

void join( lambda_reduce_body& rhs ) {
- my_value = tbb::detail::invoke(my_reduction, const_cast<const Value&>(my_value),
-     const_cast<const Value&>(rhs.my_value));
+ my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value));
}
- Value result() const {
-     return my_value;
+ Value&& result() noexcept {
+     return std::move(my_value);
}

Review comment:

Would it be desirable to && qualify this member function? As in

[[nodiscard]] Value&& result() && noexcept {
    return std::move(my_value);
}

?
That would force you to write return std::move(body).result(); elsewhere, which may be clearer. (In general, body.result() doesn't make it obvious that it is extracting the result from body.)

Contributor Author:

My first thought was to remove the && qualifier, because as an internal structure the body is only ever moved from when extracting the result. But now I think having the qualifier is clearer from a move-semantics perspective. Thanks for catching that.

Review comment:

Yeah. An alternative is to give it a different name so it’s clearer that it moves out without the std::move, but just qualifying it like this feels reasonable.
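As a side note, here is a tiny self-contained illustration of the ref-qualifier being discussed; the Holder type is hypothetical and only stands in for lambda_reduce_body:

#include <string>
#include <utility>

// Hypothetical stand-in type, used only to illustrate the &&-qualified accessor.
struct Holder {
    std::string my_value;

    explicit Holder(std::string v) : my_value(std::move(v)) {}

    // Callable only on rvalues, so the move is spelled out at every call site.
    [[nodiscard]] std::string&& result() && noexcept {
        return std::move(my_value);
    }
};

int main() {
    Holder h{"partial result"};
    // std::string s = h.result();           // does not compile: h is an lvalue
    std::string s = std::move(h).result();   // OK: the move is visible at the call site
    return s.empty() ? 1 : 0;
}

The qualifier trades a small amount of ceremony at the call site for making the move out of the object explicit, which is the point raised above.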

};

@@ -411,7 +412,7 @@ class lambda_reduce_body {
- \code Body::~Body(); \endcode Destructor
- \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r
and accumulating the result
- - \code void Body::join( Body& b ); \endcode Join results.
+ - \code void Body::join( Body&& b ); \endcode Join results.
The result in \c b should be merged into the result of \c this
**/
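To make the documented Body interface concrete, here is a hedged sketch of a user-defined body. GatherBody, its members, and the gathering task are invented for the example; its join takes an lvalue and simply moves from the argument's internals, which works under either the Body& or the Body&& spelling discussed in this review.

#include <cstddef>
#include <utility>
#include <vector>
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/parallel_reduce.h>

// Hypothetical user body (not oneTBB code): copies elements of a shared input
// vector into a per-body buffer; join() merges buffers by moving.
struct GatherBody {
    const std::vector<int>* input;
    std::vector<int> gathered;

    explicit GatherBody(const std::vector<int>& in) : input(&in) {}
    GatherBody(GatherBody& other, oneapi::tbb::split) : input(other.input) {}

    void operator()(const oneapi::tbb::blocked_range<std::size_t>& r) {
        for (std::size_t i = r.begin(); i != r.end(); ++i)
            gathered.push_back((*input)[i]);
    }

    // rhs is never used again after join(), so stealing its buffer is safe.
    void join(GatherBody& rhs) {
        if (gathered.empty())
            gathered = std::move(rhs.gathered);
        else
            gathered.insert(gathered.end(), rhs.gathered.begin(), rhs.gathered.end());
    }
};

Usage would be along the lines of constructing GatherBody body(data), calling oneapi::tbb::parallel_reduce(oneapi::tbb::blocked_range<std::size_t>(0, data.size()), body), and then reading body.gathered.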

@@ -514,7 +515,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run(range, body, __TBB_DEFAULT_PARTITIONER() );
- return body.result();
+ return std::move(body.result());

Review comment:

I think this std::move does nothing since body.result() is an rvalue?

Contributor Author:

Changed to std::move(body).result(), as discussed in the previous thread.

}

//! Parallel iteration with reduction and simple_partitioner.
@@ -527,7 +528,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run(range, body, partitioner );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction and auto_partitioner
@@ -540,7 +541,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction and static_partitioner
@@ -553,7 +554,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction and affinity_partitioner
@@ -566,7 +567,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction, default partitioner and user-supplied context.
@@ -579,7 +580,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction, simple partitioner and user-supplied context.
@@ -592,7 +593,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run( range, body, partitioner, context );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction, auto_partitioner and user-supplied context
@@ -605,7 +606,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner, context );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction, static_partitioner and user-supplied context
@@ -618,7 +619,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner, context );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with reduction, affinity_partitioner and user-supplied context
@@ -631,7 +632,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner, context );
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with deterministic reduction and default simple partitioner.
@@ -704,7 +705,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
::run(range, body, partitioner);
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with deterministic reduction and static partitioner.
@@ -716,7 +717,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner);
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
@@ -739,7 +740,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
::run(range, body, partitioner, context);
- return body.result();
+ return std::move(body.result());
}

//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
@@ -752,7 +753,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner, context);
- return body.result();
+ return std::move(body.result());
}
//@}

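Before the test changes, it may help to see the user-facing pattern this change is aimed at: the functional form of parallel_reduce where both the real body and the reduction take the running value by rvalue reference, so large partial results are moved between leaves instead of copied. The string-concatenation example below is invented for illustration and is not part of the PR.

#include <cstddef>
#include <string>
#include <utility>
#include <vector>
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/parallel_reduce.h>

// Hypothetical usage sketch: concatenate all strings in `words`.
std::string concatenate(const std::vector<std::string>& words) {
    return oneapi::tbb::parallel_reduce(
        oneapi::tbb::blocked_range<std::size_t>(0, words.size()),
        std::string{},                                  // identity, copied into each leaf
        [&](const oneapi::tbb::blocked_range<std::size_t>& r, std::string&& partial) {
            for (std::size_t i = r.begin(); i != r.end(); ++i)
                partial += words[i];
            return std::move(partial);                  // hand the buffer back without copying
        },
        [](std::string&& lhs, std::string&& rhs) {      // the combiner also receives rvalues
            lhs += rhs;
            return std::move(lhs);
        });
}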
71 changes: 71 additions & 0 deletions test/conformance/conformance_parallel_reduce.cpp
@@ -19,6 +19,7 @@
#include "common/test_invoke.h"

#include "../tbb/test_partitioner.h"
#include <vector>

//! \file conformance_parallel_reduce.cpp
//! \brief Test for [algorithms.parallel_reduce algorithms.parallel_deterministic_reduce] specification
@@ -56,6 +57,37 @@ struct ReduceBody {
}
};

template <class T>

Review comment:

Could you add a //! comment explaining what this struct is for?

Contributor Author:

Done

struct vector_wrapper : std::vector<T> {
using base_vector = std::vector<T>;

struct non_empty_exception {};

using base_vector::base_vector;
vector_wrapper() = default;

vector_wrapper(vector_wrapper&&) = default;
vector_wrapper& operator=(vector_wrapper&&) = default;

vector_wrapper(const vector_wrapper& other)
: base_vector(other)
{
if (!other.empty()) {
throw non_empty_exception{};
}
}

vector_wrapper& operator=(const vector_wrapper& other) {
if (this != &other) {
if (!other.empty()) {
throw non_empty_exception{};
}
base_vector::operator=(other); // assign via the base; calling this->operator=(other) here would recurse forever
}
return *this;
}
}; // struct vector_wrapper
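An aside on what vector_wrapper is for (the merged test adds a comment in response to the review above; the snippet here is only an illustrative sketch): copying an empty wrapper is allowed, which covers distributing the identity to the reduction leaves, while copying a non-empty partial result throws, so any code path that copies where it should move shows up as a test failure.

// Illustrative sketch only (not part of the committed test): how
// vector_wrapper flags unintended copies inside parallel_reduce.
void vector_wrapper_intent_demo() {
    vector_wrapper<int> identity;                      // empty
    vector_wrapper<int> leaf = identity;               // OK: copying the empty identity is fine
    (void)leaf;

    vector_wrapper<int> partial{1, 2, 3};              // a non-empty partial result
    // vector_wrapper<int> oops = partial;             // would throw non_empty_exception:
    //                                                 // partial results must be moved, not copied
    vector_wrapper<int> moved = std::move(partial);    // moves are always allowed
    (void)moved;
}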

template <class Partitioner>
void TestDeterministicReductionFor() {
const int N = 1000;
@@ -174,3 +206,42 @@ TEST_CASE("parallel_[deterministic_]reduce and std::invoke") {
}

#endif

TEST_CASE("test rvalue optimization") {
constexpr std::size_t n_vectors = 10000;
std::vector<vector_wrapper<int>> vector_of_vectors;
auto init = {1, 2, 3, 4, 5};

vector_of_vectors.reserve(n_vectors);
for (std::size_t i = 0; i < n_vectors; ++i) {
vector_of_vectors.emplace_back(vector_wrapper<int>(init));
}

oneapi::tbb::blocked_range<std::size_t> range(0, n_vectors);

auto reduce_body = [&](const oneapi::tbb::blocked_range<std::size_t>& rng, vector_wrapper<int>&& x) {
vector_wrapper<int> new_vector = std::move(x);
for (std::size_t index = rng.begin(); index != rng.end(); ++index) {
new_vector.reserve(vector_of_vectors[index].size());
new_vector.insert(new_vector.end(), std::make_move_iterator(vector_of_vectors[index].begin()), std::make_move_iterator(vector_of_vectors[index].end()));
}
return new_vector;
};

auto join_body = [](vector_wrapper<int>&& x, vector_wrapper<int>&& y) {
vector_wrapper<int> new_vector = std::move(x);
new_vector.reserve(new_vector.size() + y.size());
new_vector.insert(new_vector.end(), std::make_move_iterator(y.begin()), std::make_move_iterator(y.end()));

Review comment:

A move iterator over a vector of ints doesn't buy much. Consider for this example making

struct MoveOnlyInt {
    int x;
    MoveOnlyInt() = default;
    explicit MoveOnlyInt(int x) : x{x} {}
    MoveOnlyInt(const MoveOnlyInt&) = delete;
    MoveOnlyInt(MoveOnlyInt&& other) : x{std::exchange(other.x, 0)} {}
    MoveOnlyInt& operator=(const MoveOnlyInt&) = delete;
    MoveOnlyInt& operator=(MoveOnlyInt&& other) {
        this->x = std::exchange(other.x, 0);
        return *this;
    }
};

and using vectors of MoveOnlyInt so that the move iterators are required?

@BenFrantzDale (Feb 1, 2024):

Could also consider a test case using std::list or std::set that moves the nodes across, like

auto join_body = [](std::list<int>&& x, std::list<int>&& y) {
    x.splice(x.end(), std::move(y));
    return std::move(x);
};

or

auto join_body = [](std::set<int> x, std::set<int>&& y) {
    x.merge(std::move(y));
    return std::move(x);
};

(These could use MoveOnlyInt too.)

Review comment:

It would probably be good to do a test with a type that isn’t default constructible and/or isn’t movable or copyable in a node-based container — basically all the weird corner cases of irregular types.

Contributor Author:

Unfortunately, we still cannot use move-only types with parallel_reduce, since the identity has to be copied into each reduction "leaf", and using std::vector<MoveOnlyInt> or a list of them would cause compilation errors even when the identity element is empty.
I have changed the value type to an integer wrapper that is copyable, but whose copy constructor should never actually be called.

Regarding adding new test cases, I don't think it makes sense to test both the "vector of vectors" and "vector of lists" use cases. Splicing the lists while doing parallel_reduce is more interesting IMHO, so I kept only the vector-of-lists test.
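For reference, a hedged sketch of the splice-based variant described above; the helper name, the container of std::list pieces, and the copying of each piece inside the leaf body are assumptions for illustration rather than the committed test:

#include <cstddef>
#include <list>
#include <utility>
#include <vector>
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/parallel_reduce.h>

// Hypothetical sketch: concatenate a vector of lists with O(1) splices in join.
std::list<int> concatenate_lists(const std::vector<std::list<int>>& pieces) {
    oneapi::tbb::blocked_range<std::size_t> range(0, pieces.size());

    auto reduce_body = [&](const oneapi::tbb::blocked_range<std::size_t>& rng, std::list<int>&& acc) {
        for (std::size_t i = rng.begin(); i != rng.end(); ++i) {
            std::list<int> piece = pieces[i];      // the pieces are copied; the accumulator is moved
            acc.splice(acc.end(), std::move(piece));
        }
        return std::move(acc);
    };

    auto join_body = [](std::list<int>&& x, std::list<int>&& y) {
        x.splice(x.end(), std::move(y));           // transfers nodes, no element copies
        return std::move(x);
    };

    return oneapi::tbb::parallel_reduce(range, std::list<int>{}, reduce_body, join_body);
}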

return new_vector;
};

vector_wrapper<int> result = oneapi::tbb::parallel_reduce(range, vector_wrapper<int>{}, reduce_body, join_body);

vector_wrapper<int> expected_vector;
expected_vector.reserve(5 * n_vectors);
for (std::size_t i = 0; i < n_vectors; ++i) {
expected_vector.insert(expected_vector.end(), init);
}

REQUIRE_MESSAGE(expected_vector == result, "Incorrect reduce result");
}