Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better rvalues support for parallel_reduce algorithm #1307

Merged
merged 10 commits into from
Feb 14, 2024
49 changes: 25 additions & 24 deletions include/oneapi/tbb/parallel_reduce.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2005-2023 Intel Corporation
Copyright (c) 2005-2024 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,16 +42,16 @@ concept parallel_reduce_body = splittable<Body> &&

template <typename Function, typename Range, typename Value>
concept parallel_reduce_function = std::invocable<const std::remove_reference_t<Function>&,
const Range&, const Value&> &&
const Range&, Value&&> &&
std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Function>&,
const Range&, const Value&>,
const Range&, Value&&>,
Value>;

template <typename Combine, typename Value>
concept parallel_reduce_combine = std::invocable<const std::remove_reference_t<Combine>&,
const Value&, const Value&> &&
Value&&, Value&&> &&
std::convertible_to<std::invoke_result_t<const std::remove_reference_t<Combine>&,
const Value&, const Value&>,
Value&&, Value&&>,
Value>;

} // namespace d0
Expand Down Expand Up @@ -390,14 +390,15 @@ class lambda_reduce_body {
, my_value(other.my_identity_element)
{ }
void operator()(Range& range) {
my_value = tbb::detail::invoke(my_real_body, range, const_cast<const Value&>(my_value));
my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value));
}

void join( lambda_reduce_body& rhs ) {
my_value = tbb::detail::invoke(my_reduction, const_cast<const Value&>(my_value),
const_cast<const Value&>(rhs.my_value));
my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value));
}
Value result() const {
return my_value;

__TBB_nodiscard Value&& result() && noexcept {
return std::move(my_value);
}
};

Expand Down Expand Up @@ -514,7 +515,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run(range, body, __TBB_DEFAULT_PARTITIONER() );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and simple_partitioner.
Expand All @@ -527,7 +528,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run(range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and auto_partitioner
Expand All @@ -540,7 +541,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and static_partitioner
Expand All @@ -553,7 +554,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction and affinity_partitioner
Expand All @@ -566,7 +567,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, default partitioner and user-supplied context.
Expand All @@ -579,7 +580,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const __TBB_DEFAULT_PARTITIONER>
::run( range, body, __TBB_DEFAULT_PARTITIONER(), context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, simple partitioner and user-supplied context.
Expand All @@ -592,7 +593,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const simple_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, auto_partitioner and user-supplied context
Expand All @@ -605,7 +606,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const auto_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, static_partitioner and user-supplied context
Expand All @@ -618,7 +619,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,const static_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with reduction, affinity_partitioner and user-supplied context
Expand All @@ -631,7 +632,7 @@ Value parallel_reduce( const Range& range, const Value& identity, const RealBody
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>,affinity_partitioner>
::run( range, body, partitioner, context );
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction and default simple partitioner.
Expand Down Expand Up @@ -704,7 +705,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range,Value,RealBody,Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range,lambda_reduce_body<Range,Value,RealBody,Reduction>, const simple_partitioner>
::run(range, body, partitioner);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction and static partitioner.
Expand All @@ -716,7 +717,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context.
Expand All @@ -739,7 +740,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const simple_partitioner>
::run(range, body, partitioner, context);
return body.result();
return std::move(body).result();
}

//! Parallel iteration with deterministic reduction, static partitioner and user-supplied context.
Expand All @@ -752,7 +753,7 @@ Value parallel_deterministic_reduce( const Range& range, const Value& identity,
lambda_reduce_body<Range, Value, RealBody, Reduction> body(identity, real_body, reduction);
start_deterministic_reduce<Range, lambda_reduce_body<Range, Value, RealBody, Reduction>, const static_partitioner>
::run(range, body, partitioner, context);
return body.result();
return std::move(body).result();
}
//@}

Expand Down
169 changes: 168 additions & 1 deletion test/conformance/conformance_parallel_reduce.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2005-2023 Intel Corporation
Copyright (c) 2005-2024 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include "common/test_invoke.h"

#include "../tbb/test_partitioner.h"
#include <list>

//! \file conformance_parallel_reduce.cpp
//! \brief Test for [algorithms.parallel_reduce algorithms.parallel_deterministic_reduce] specification
Expand Down Expand Up @@ -56,6 +57,67 @@ struct ReduceBody {
}
};

// A type that is formally copyable (so that standard containers can store it),
// but whose copy constructor and copy assignment are expected never to be invoked.
template <typename T>
class NeverCopyWrapper {
public:
NeverCopyWrapper() = default;
NeverCopyWrapper(const T& obj) : my_obj(obj) {}

NeverCopyWrapper(NeverCopyWrapper&&) = default;
NeverCopyWrapper& operator=(NeverCopyWrapper&&) = default;

NeverCopyWrapper(const NeverCopyWrapper&) {
REQUIRE_MESSAGE(false, "Copy constructor of NeverCopyWrapper should never be called");
}

NeverCopyWrapper& operator=(const NeverCopyWrapper&) {
REQUIRE_MESSAGE(false, "Copy assignment of NeverCopyWrapper should never be called");
return *this;
}

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to =delete; these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot delete them because if we provide std::vector<NeverCopyWrapper> as an identity to parallel_reduce, it must be copied into each reduce leaf. Even though only the empty identity is ever copied, std::vector still requires its value_type to be copy-constructible, so deleting the copy operations causes compilation errors.
Because of this, I decided to keep the type formally copyable to satisfy the compiler, while making any actual call to the copy constructor or copy assignment fail the test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have investigated one more time, and since we don't use std::vector directly, only using the wrapper, we can control that actual copy constructor would not be called by the container. So I removed NeverCopyWrapper and changed it to be MoveOnlyWrapper


bool operator==(const NeverCopyWrapper& other) const { return my_obj == other.my_obj; }
private:
T my_obj;
}; // class NeverCopyWrapper

// A container wrapper that is copyable, but whose copy operations fail the test if the source is non-empty.
// When an empty instance is provided as the identity to a parallel reduce algorithm with an rvalue-friendly body,
// the only copies made should be those broadcasting the identity element into the leaves;
// the identity element used in the tests below is an empty container.
template <typename T>
class EmptyCopyList {
    using list_t = std::list<T>;

    list_t my_list;  // wrapped payload; may only be copied while empty

public:
    EmptyCopyList() = default;

    // Moving is the intended way to transfer contents between instances.
    EmptyCopyList(EmptyCopyList&&) = default;
    EmptyCopyList& operator=(EmptyCopyList&&) = default;

    // Copying is syntactically allowed (the reduce algorithm broadcasts the
    // identity by copy), but copying a non-empty instance fails the test.
    EmptyCopyList(const EmptyCopyList& other) {
        REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
    }
    EmptyCopyList& operator=(const EmptyCopyList& other) {
        REQUIRE_MESSAGE(other.my_list.empty(), "reduce copied non-identity list");
        return *this;
    }

    // Inserts a single element before pos, taking it by rvalue reference.
    typename list_t::iterator insert(typename list_t::const_iterator pos, T&& item) {
        return my_list.insert(pos, std::move(item));
    }

    // Steals all elements from other and places them before pos (no copies).
    void splice(typename list_t::const_iterator pos, EmptyCopyList&& other) {
        my_list.splice(pos, std::move(other.my_list));
    }

    typename list_t::const_iterator end() const { return my_list.end(); }

    bool operator==(const EmptyCopyList& other) const { return my_list == other.my_list; }
}; // class EmptyCopyList

template <class Partitioner>
void TestDeterministicReductionFor() {
const int N = 1000;
Expand Down Expand Up @@ -174,3 +236,108 @@ TEST_CASE("parallel_[deterministic_]reduce and std::invoke") {
}

#endif

template <typename Runner, typename... PartitionerContext>
void test_vector_of_lists_rvalue_reduce_basic(const Runner& runner, PartitionerContext&&... args) {
constexpr std::size_t n_vectors = 10'000;

using inner_type = NeverCopyWrapper<int>;
using list_type = EmptyCopyList<inner_type>;
using vector_of_lists_type = std::vector<list_type>;

vector_of_lists_type vector_of_lists;

vector_of_lists.reserve(n_vectors);
for (std::size_t i = 0; i < n_vectors; ++i) {
list_type list;

list.insert(list.end(), inner_type{1});
list.insert(list.end(), inner_type{2});
list.insert(list.end(), inner_type{3});
list.insert(list.end(), inner_type{4});
list.insert(list.end(), inner_type{5});
vector_of_lists.emplace_back(std::move(list));
}

oneapi::tbb::blocked_range<std::size_t> range(0, n_vectors, n_vectors * 2);

auto reduce_body = [&](const decltype(range)& range_obj, list_type&& x) {
list_type new_list = std::move(x);

for (std::size_t index = range_obj.begin(); index != range_obj.end(); ++index) {
new_list.splice(new_list.end(), std::move(vector_of_lists[index]));
}
return new_list;
};

auto join_body = [&](list_type&& x, list_type&& y) {
list_type new_list = std::move(x);

new_list.splice(new_list.end(), std::move(y));
return new_list;
};

list_type result = runner(range, list_type{}, reduce_body, join_body, std::forward<PartitionerContext>(args)...);

list_type expected_result;

for (std::size_t i = 0; i < n_vectors; ++i) {
expected_result.insert(expected_result.end(), inner_type{1});
expected_result.insert(expected_result.end(), inner_type{2});
expected_result.insert(expected_result.end(), inner_type{3});
expected_result.insert(expected_result.end(), inner_type{4});
expected_result.insert(expected_result.end(), inner_type{5});
}

REQUIRE_MESSAGE(expected_result == result, "Incorrect reduce result");
}

// Callable adaptor that dispatches to oneapi::tbb::parallel_reduce, perfectly
// forwarding every argument. A trailing return type is used because return-type
// deduction (plain auto / decltype(auto)) requires C++14 and the suite still
// supports C++11.
struct ReduceRunner {
    template <typename... ArgTypes>
    auto operator()(ArgTypes&&... call_args) const
        -> decltype(oneapi::tbb::parallel_reduce(std::forward<ArgTypes>(call_args)...)) {
        return oneapi::tbb::parallel_reduce(std::forward<ArgTypes>(call_args)...);
    }
};

struct DeterministicReduceRunner {
template <typename... Args>
auto operator()(Args&&... args) const -> decltype(oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...)) {
return oneapi::tbb::parallel_deterministic_reduce(std::forward<Args>(args)...);
}
Comment on lines +296 to +298

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the trailing return type helpful versus just leave it to auto (or decltype(auto))?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are forced to use trailing return type, because both auto with automatic deduction and decltype(auto) are C++14 features and we still need to support C++11.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. That's annoying.

};

void test_vector_of_lists_rvalue_reduce() {
ReduceRunner runner;
oneapi::tbb::affinity_partitioner af_partitioner;
oneapi::tbb::task_group_context context;

test_vector_of_lists_rvalue_reduce_basic(runner);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner);

test_vector_of_lists_rvalue_reduce_basic(runner, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::auto_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, af_partitioner, context);
}

void test_vector_of_lists_rvalue_deterministic_reduce() {
DeterministicReduceRunner runner;
oneapi::tbb::task_group_context context;

test_vector_of_lists_rvalue_reduce_basic(runner);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{});
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{});

test_vector_of_lists_rvalue_reduce_basic(runner, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::simple_partitioner{}, context);
test_vector_of_lists_rvalue_reduce_basic(runner, oneapi::tbb::static_partitioner{}, context);
}

//! Testing that parallel_reduce and parallel_deterministic_reduce move
//! (rather than copy) partial results when given rvalue-friendly bodies
TEST_CASE("test rvalue optimization") {
test_vector_of_lists_rvalue_reduce();
test_vector_of_lists_rvalue_deterministic_reduce();
}
Loading