diff --git a/doc/GSG/get_started.rst b/doc/GSG/get_started.rst index ec41123e4c..ed69b022e7 100644 --- a/doc/GSG/get_started.rst +++ b/doc/GSG/get_started.rst @@ -9,6 +9,8 @@ Get Started with |short_name| .. include:: before_beginning_and_example.rst +.. include:: hybrid_cpu_support.rst + Find more ********* diff --git a/doc/GSG/hybrid_cpu_support.rst b/doc/GSG/hybrid_cpu_support.rst new file mode 100644 index 0000000000..a2b32f12b4 --- /dev/null +++ b/doc/GSG/hybrid_cpu_support.rst @@ -0,0 +1,40 @@ +.. _hybrid_cpu_support: + +Hybrid CPU and NUMA Support +*************************** + +If you need NUMA/Hybrid CPU support in oneTBB, make sure that HWLOC* is installed on your system. + +HWLOC* (Hardware Locality) is a library that provides a portable abstraction of the hierarchical topology of modern architectures (NUMA, hybrid CPU systems, etc.). +oneTBB relies on HWLOC* to identify the underlying topology of the system to optimize thread scheduling and memory allocation. + +Without HWLOC*, oneTBB may not take advantage of NUMA/Hybrid CPU support. Therefore, it's important to make sure that HWLOC* is installed before using oneTBB on such systems. + +Check HWLOC* on the System +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To check if HWLOC* is already installed on your system, run ``hwloc-ls``: + + * For Linux* OS, in the command line. + * For Windows* OS, in the command prompt. + +If HWLOC* is installed, the command displays information about the hardware topology of your system. +If it is not installed, you receive an error message saying that the command ``hwloc-ls`` could not be found. + +.. note:: For Hybrid CPU support, make sure that HWLOC* is version 2.5 or higher. + For NUMA support, install HWLOC* version 1.11 or higher. + +Install HWLOC* +^^^^^^^^^^^^^^ + +To install HWLOC*, visit the official Portable Hardware Locality website (https://www-lb.open-mpi.org/projects/hwloc/). + +* For Windows* OS, binaries are available for download. 
+* For Linux* OS, only the source code is provided, and you must build the binaries yourself. + +On Linux* OS, HWLOC* can also be installed with package managers, such as APT*, YUM*, etc. +For example, with APT*, run: ``sudo apt install hwloc``. + + +.. note:: For Hybrid CPU support, make sure that HWLOC* is version 2.5 or higher. + For NUMA support, install HWLOC* version 1.11 or higher. diff --git a/doc/conf.py b/doc/conf.py index 4c7e2b8a4b..39a5ca90c9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -29,7 +29,7 @@ project = u'Intel® oneAPI Threading Building Blocks (oneTBB)' else: project = u'oneTBB' -copyright = u'2022, Intel Corporation' +copyright = u'2023, Intel Corporation' author = u'Intel' # The short X.Y version diff --git a/doc/main/reference/reference.rst b/doc/main/reference/reference.rst index 9c8bca526a..87d05c3267 100644 --- a/doc/main/reference/reference.rst +++ b/doc/main/reference/reference.rst @@ -19,6 +19,7 @@ It also describes features that are not included in the oneTBB specification. info_namespace parallel_for_each_semantics parallel_sort_ranges_extension + scalable_memory_pools/malloc_replacement_log Preview features **************** diff --git a/doc/main/reference/scalable_memory_pools.rst b/doc/main/reference/scalable_memory_pools.rst index 6cf81139b7..d04cd54a90 100644 --- a/doc/main/reference/scalable_memory_pools.rst +++ b/doc/main/reference/scalable_memory_pools.rst @@ -41,3 +41,4 @@ Here, ``P`` represents an instance of the memory pool class. scalable_memory_pools/memory_pool_cls scalable_memory_pools/fixed_pool_cls scalable_memory_pools/memory_pool_allocator_cls + diff --git a/doc/main/reference/scalable_memory_pools/malloc_replacement_log.rst b/doc/main/reference/scalable_memory_pools/malloc_replacement_log.rst new file mode 100644 index 0000000000..8fea89f949 --- /dev/null +++ b/doc/main/reference/scalable_memory_pools/malloc_replacement_log.rst @@ -0,0 +1,84 @@ +.. 
_malloc_replacement_log: + +TBB_malloc_replacement_log Function +=================================== + +.. note:: This function is for Windows* OS only. + +Summary +******* + +Provides information about the status of dynamic memory allocation replacement. + +Syntax +******* + +:: + + extern "C" int TBB_malloc_replacement_log(char *** log_ptr); + + +Header +****** + +:: + + #include "oneapi/tbb/tbbmalloc_proxy.h" + + +Description +*********** + +Dynamic replacement of memory allocation functions on Windows* OS uses in-memory binary instrumentation techniques. +To make sure that such instrumentation is safe, oneTBB first searches for a subset of replaced functions in the Visual C++* runtime DLLs +and checks if each one has a known bytecode pattern. If any required function is not found or its bytecode pattern is unknown, the replacement is skipped, +and the program continues to use the standard memory allocation functions. + +The ``TBB_malloc_replacement_log`` function allows the program to check whether the dynamic memory replacement has taken place and to get a log of the performed checks. + +**Returns:** + +* 0, if all necessary functions are successfully found and the replacement takes place. +* 1, otherwise. + +The ``log_ptr`` parameter must be an address of a ``char**`` variable or be ``NULL``. If it is not ``NULL``, the function writes to that variable the address of an array of +NULL-terminated strings containing detailed information about the searched functions in the following format: + +:: + + search_status: function_name (dll_name), byte pattern: <bytecodes> + + +For more information about the replacement of dynamic memory allocation functions, see :ref:`Windows_C_Dynamic_Memory_Interface_Replacement`. 
+ + +Example +******* + +:: + + #include "oneapi/tbb/tbbmalloc_proxy.h" + #include <stdio.h> + + int main(){ + char **func_replacement_log; + int func_replacement_status = TBB_malloc_replacement_log(&func_replacement_log); + + if (func_replacement_status != 0) { + printf("tbbmalloc_proxy cannot replace memory allocation routines\n"); + for (char** log_string = func_replacement_log; *log_string != 0; log_string++) { + printf("%s\n",*log_string); + } + } + + return 0; + } + + +Example output: + +:: + + tbbmalloc_proxy cannot replace memory allocation routines + Success: free (ucrtbase.dll), byte pattern: + Fail: _msize (ucrtbase.dll), byte pattern: diff --git a/doc/main/tbb_userguide/Floating_Point_Settings.rst b/doc/main/tbb_userguide/Floating_Point_Settings.rst new file mode 100644 index 0000000000..4618f56ae5 --- /dev/null +++ b/doc/main/tbb_userguide/Floating_Point_Settings.rst @@ -0,0 +1,60 @@ +.. _Floating_Point_Settings: + +Floating-point Settings +======================= + +To propagate CPU-specific settings for floating-point computations to tasks executed by the task scheduler, you can use one of the following two methods: + +* When a ``task_arena`` or a task scheduler for a given application thread is initialized, it captures the current floating-point settings of the thread. +* The ``task_group_context`` class has a method to capture the current floating-point settings. + +By default, worker threads use floating-point settings obtained during the initialization of a ``task_arena`` or the implicit arena of the application thread. The settings are applied to all computations within that ``task_arena`` or started by that application thread. + + +For better control over floating-point behavior, a thread may capture the current settings in a task group context. 
To do so, pass a special flag to the constructor at context creation: + +:: + + task_group_context ctx( task_group_context::isolated, + task_group_context::default_traits | task_group_context::fp_settings ); + + +Or call the ``capture_fp_settings`` method: + +:: + + task_group_context ctx; + ctx.capture_fp_settings(); + + +You can then pass the task group context to most parallel algorithms, including ``flow::graph``, to ensure that all tasks related to this algorithm use the specified floating-point settings. +It is possible to execute the parallel algorithms with different floating-point settings captured to separate contexts, even at the same time. + +Floating-point settings captured to a task group context prevail over the settings captured during task scheduler initialization. This means that if a context is passed to a parallel algorithm, the floating-point settings captured to the context are used. +Otherwise, if floating-point settings are not captured to the context, or a context is not explicitly specified, the settings captured during the task arena initialization are used. + +In a nested call to a parallel algorithm that does not use the context of a task group with explicitly captured floating-point settings, the outer-level settings are used. +If none of the outer-level contexts capture floating-point settings, the settings captured during task arena initialization are used. + +This behavior guarantees that: + +* Floating-point settings are applied to all tasks executed within a task arena, if they are captured: + + * To a task group context. + * During the arena initialization. + +* A call to a oneTBB parallel algorithm does not change the floating-point settings of the calling thread, even if the algorithm uses different settings. + +.. note:: + The guarantees above apply only under the following conditions: + + * User code inside a task should: + + * Not change the floating-point settings. + * Revert any modifications. 
+ * Restore previous settings before the end of the task. + + * oneTBB task scheduler observers are not used to set or modify floating-point settings. + + Otherwise, the stated guarantees are not valid and the behavior related to floating-point settings is undefined. + diff --git a/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst b/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst index 939f713cd3..05786fbd82 100644 --- a/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst +++ b/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst @@ -172,13 +172,13 @@ equivalent version of the previous example that does this follows: void RunPipeline( int ntoken, FILE* input_file, FILE* output_file ) { - oneapi::tbb::filter_mode f1( oneapi::tbb::filter_mode::serial_in_order, + oneapi::tbb::filter f1( oneapi::tbb::filter_mode::serial_in_order, MyInputFunc(input_file) ); - oneapi::tbb::filter_mode f2(oneapi::tbb::filter_mode::parallel, + oneapi::tbb::filter f2(oneapi::tbb::filter_mode::parallel, MyTransformFunc() ); - oneapi::tbb::filter_mode f3(oneapi::tbb::filter_mode::serial_in_order, + oneapi::tbb::filter f3(oneapi::tbb::filter_mode::serial_in_order, MyOutputFunc(output_file) ); - oneapi::tbb::filter_mode f = f1 & f2 & f3; + oneapi::tbb::filter f = f1 & f2 & f3; oneapi::tbb::parallel_pipeline(ntoken,f); } diff --git a/doc/main/tbb_userguide/title.rst b/doc/main/tbb_userguide/title.rst index c57cf2f6c2..b51c3294b8 100644 --- a/doc/main/tbb_userguide/title.rst +++ b/doc/main/tbb_userguide/title.rst @@ -14,6 +14,7 @@ ../tbb_userguide/Flow_Graph ../tbb_userguide/work_isolation ../tbb_userguide/Exceptions_and_Cancellation + ../tbb_userguide/Floating_Point_Settings ../tbb_userguide/Containers ../tbb_userguide/Mutual_Exclusion ../tbb_userguide/Timing