Keep separate per-pipeline operator counters. Error out when "stealin…

…g" subgraphs from other pipelines results in duplicate names. (#5506) Prior to this change constructing exactly the same pipeline (e.g. by calling a function decorated with @pipeline_def) multiple times produced pipelines with different operator instance names and differently named operator instances and DataNodes. This PR changes that so that pipelines with the same structure have the same node names. This is achieved by: * Using a separate operator counter in each pipeline. When all nodes all instantiated within pipeline scope, no further action is required. This happens for the vast majority of cases. * If a name collision occurs (only possible when "stealing" a subgraph from another pipeline), an error is raised. Pipelines that are defined without a "current" pipeline have distinct operator instance names. Additionally, there were some problems with operator discovery. I rewrote it to a much simpler DFS. ---- Signed-off-by: Michal Zientkiewicz <[email protected]>
NVIDIA · Jun 11, 2024 · 9dfdeac · 9dfdeac
1 parent f850059
commit 9dfdeac
Show file tree

Hide file tree

Showing 10 changed files with 184 additions and 54 deletions.
diff --git a/dali/pipeline/operator/op_spec.h b/dali/pipeline/operator/op_spec.h
@@ -264,6 +264,16 @@ class DLL_PUBLIC OpSpec {
     return outputs_[idx].device;
   }
 
+  DLL_PUBLIC inline void RenameInput(int idx, std::string name) {
+    DALI_ENFORCE_VALID_INDEX(idx, NumInput());
+    inputs_[idx].name = std::move(name);
+  }
+
+  DLL_PUBLIC inline void RenameOutput(int idx, std::string name) {
+    DALI_ENFORCE_VALID_INDEX(idx, NumOutput());
+    outputs_[idx].name = std::move(name);
+  }
+
   DLL_PUBLIC inline auto &ArgumentInputs() const {
     return argument_inputs_;
   }

diff --git a/dali/pipeline/pipeline.cc b/dali/pipeline/pipeline.cc
@@ -312,7 +312,7 @@ int Pipeline::AddOperatorImpl(const OpSpec &const_spec, const std::string &inst_
 
     DALI_ENFORCE(it != edge_names_.end(),
                  make_string("Data node \"", input_name, "\" requested as ", FormatInput(spec, i),
-                             " to the operator is not known to the pipeline."));
+                             " to operator \"", inst_name, "\" is not known to the pipeline."));
 
     // Table of possible scenarios:
     // Op location / requested input type / data location
@@ -360,7 +360,7 @@ int Pipeline::AddOperatorImpl(const OpSpec &const_spec, const std::string &inst_
     DALI_ENFORCE(
         it != edge_names_.end(),
         make_string("Data node \"", input_name, "\" requested as ", FormatArgument(spec, arg_name),
-                    " to operator is not known to the pipeline."));
+                    " to operator \"", inst_name, "\" is not known to the pipeline."));
 
     if (!it->second.has_cpu) {
       assert(it->second.has_gpu);

diff --git a/dali/python/backend_impl.cc b/dali/python/backend_impl.cc
@@ -2152,6 +2152,15 @@ PYBIND11_MODULE(backend_impl, m) {
         py::return_value_policy::reference_internal)
     .def("AddOutput", &OpSpec::AddOutput,
         py::return_value_policy::reference_internal)
+    .def("RenameInput", &OpSpec::RenameInput, "idx"_a, "name"_a)
+    .def("RenameOutput", &OpSpec::RenameOutput, "idx"_a, "name"_a)
+    .def("InputName", &OpSpec::InputName, "idx"_a)
+    .def("InputDevice", &OpSpec::InputDevice, "idx"_a)
+    .def("OutputName", &OpSpec::OutputName, "idx"_a)
+    .def("OutputDevice", &OpSpec::OutputDevice, "idx"_a)
+    .def("NumInput", &OpSpec::NumInput)
+    .def("NumRegularInput", &OpSpec::NumRegularInput)
+    .def("NumOutput", &OpSpec::NumOutput)
     DALI_OPSPEC_ADDARG(std::string)
     DALI_OPSPEC_ADDARG(bool)
     DALI_OPSPEC_ADDARG(int64)

diff --git a/dali/python/nvidia/dali/data_node.py b/dali/python/nvidia/dali/data_node.py
@@ -73,7 +73,7 @@ def __init__(self, name, device="cpu", source=None):
         self.source = source
 
     def __str__(self):
-        return f'DataNode(name="{self.name}", device="{self.device}")'
+        return f'DataNode(name="{self.name}", device="{self.device}, source="{self.source}")'
 
     __repr__ = __str__
 

diff --git a/dali/python/nvidia/dali/ops/__init__.py b/dali/python/nvidia/dali/ops/__init__.py
@@ -17,6 +17,7 @@
 import threading
 import tree
 import warnings
+import weakref
 from itertools import count
 
 import nvidia.dali.python_function_plugin
@@ -369,11 +370,16 @@ def __init__(self, inputs, arg_inputs, arguments, _processed_arguments, op):
         op : Operator class.
             Operator class containing the schema, and spec filled with `processed_arguments`.
         """
-        self._counter = _OpCounter()
+
+        if _Pipeline.current():
+            self._pipeline = weakref.ref(_Pipeline.current())
+        else:
+            self._pipeline = None
+        self._id = None
         self._outputs = []
         self._op = op
         self._spec = op.spec.copy()
-        self._relation_id = self._counter.id
+        self._relation_id = None
 
         if _conditionals.conditionals_enabled():
             inputs, arg_inputs = _conditionals.apply_conditional_split_to_args(inputs, arg_inputs)
@@ -412,8 +418,13 @@ def _process_instance_name(self, arguments):
         name = arguments.pop("name", None)
         if name is not None:
             self._name = name
+            self._autoname = False
         else:
-            self._name = "__" + type(self._op).__name__ + "_" + str(self._counter.id)
+            has_pipeline = self.pipeline is not None
+            # to avoid mixing up global and per-pipeline ids
+            infix = "_" if has_pipeline else "_detached_"
+            self._name = "__" + type(self._op).__name__ + infix + str(self.id)
+            self._autoname = True
 
     def _process_trace(self, arguments):
         from nvidia.dali._debug_mode import _PipelineDebug
@@ -463,9 +474,21 @@ def _generate_outputs(self):
                 pipeline.add_sink(t)
             self.append_output(t)
 
+    @property
+    def pipeline(self):
+        return None if self._pipeline is None else self._pipeline()
+
     @property
     def id(self):
-        return self._counter.id
+        if self._id is None:
+            if self.pipeline is None and _Pipeline.current():
+                self._pipeline = weakref.ref(_Pipeline.current())
+            if self.pipeline:
+                self._id = self.pipeline._next_op_id()
+            else:
+                self._id = _OpCounter().id
+
+        return self._id
 
     @property
     def inputs(self):
@@ -492,6 +515,8 @@ def name(self):
 
     @property
     def relation_id(self):
+        if self._relation_id is None:
+            self._relation_id = id(self)
         return self._relation_id
 
     @relation_id.setter
@@ -629,7 +654,7 @@ def __call__(self, *inputs, **kwargs):
                 )
 
             # Tie the instances together
-            relation_id = op_instances[0].id
+            relation_id = op_instances[0].relation_id
             for op in op_instances:
                 op.relation_id = relation_id
 

diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py
@@ -14,7 +14,6 @@
 
 # pylint: disable=no-member
 from typing import Any, List, Tuple, Callable, Optional, Union, TypeVar, overload
-from collections import deque
 from nvidia.dali import backend as b
 from nvidia.dali import types
 from nvidia.dali import internal
@@ -235,6 +234,7 @@ def __init__(
         self._num_threads = num_threads
         self._device_id = device_id
         self._seed = seed
+        self._next_op_id_counter = 0
         self._exec_pipelined = exec_pipelined
         # When initializing DALI, we do the following in order:
         # * Discover the ops specified in Python, group the ExternalSources (_build_graph())
@@ -719,6 +719,48 @@ def __exit__(self, type, value, traceback):
 
         return api_checker(self)
 
+    def _require_unique_names(self):
+        ops_by_name = {}
+        for op in self._ops:
+            ops = ops_by_name.get(op.name, [])
+            ops.append(op)
+        duplicate = {}
+        foreign = False
+        for name, ops in ops_by_name.items():
+            if len(ops) > 1:
+                duplicate[name] = ops
+                for op in ops:
+                    if op.pipeline is not self:
+                        foreign = True
+
+        if duplicate:
+            message = (
+                f"The pipeline is invalid because it contains operators with non-unique names:\n"
+                f"{duplicate}"
+            )
+            if foreign:
+                message += (
+                    "\nThe likely cause is that the pipeline contains a subgraph "
+                    "instantiated while a different pipeline was set as the current "
+                    "pipeline (e.g. inside another pipeline's graph definition function).\n"
+                )
+            raise RuntimeError(message)
+
+    def _require_no_foreign_ops(self, message):
+        foreign = []
+        for op in self._ops:
+            if op.pipeline is not self:
+                foreign.append(op)
+        if foreign:
+            raise RuntimeError(
+                f"{message} because it contains operator(s) "
+                f"that were defined outside the pipeline scope:\n"
+                f"{[o.name for o in foreign]}\n"
+                f"All operators should be defined while the pipeline is set as the current "
+                f"pipeline. This happens automatically when defining the pipeline in a "
+                f"function decorated with `@pipeline_def`."
+            )
+
     # Graph is constructed by backtracking from the output edges and the edges marked as sinks
     def _build_graph(self, define_graph=None):
         if define_graph is not None:
@@ -776,6 +818,10 @@ def contains_nested_datanode(nested):
             _data_node._check(outputs[i])
 
         self._ops = _collect_ops(list(outputs) + self._sinks)
+        self._require_unique_names()
+        if self._enable_checkpointing:
+            self._require_no_foreign_ops("The pipeline does not support checkpointing")
+
         self._graph_outputs = outputs
         self._setup_input_callbacks()
         self._disable_pruned_external_source_instances()
@@ -960,6 +1006,11 @@ def _restore_state_from_checkpoint(self):
             self._iterator_data = external_ctx_cpt.iterator_data
             self._is_restored_from_checkpoint = True
 
+    def _next_op_id(self):
+        i = self._next_op_id_counter
+        self._next_op_id_counter += 1
+        return i
+
     def build(self):
         """Build the pipeline.
 
@@ -2101,37 +2152,26 @@ def get_op_input_edges(op) -> List[DataNode]:
             else:
                 yield inp
 
-    def get_op_outputs_num():
-        # BSF traverse the graph first to learn, for each reachable operator in the graph,
-        # how many data-nodes/edges the operator contributes to
-        # (i.e. the number of outputs of the operator instance)
-        op_outputs_num = {}
-        edges = deque(output_nodes)
-        while edges:
-            current_edge = edges.popleft()
-            source_op = get_source_op(current_edge)
-            if source_op.id in op_outputs_num:
-                op_outputs_num[source_op.id] += 1
-            else:
-                op_outputs_num[source_op.id] = 1
-                source_op.check_args()
-                edges.extend(get_op_input_edges(source_op))
-        return op_outputs_num
-
+    visited = set()
     ops = []
-    edges = deque(output_nodes)
-    op_total_outputs_num = get_op_outputs_num()
-    op_visited_outputs_num = {op_id: 0 for op_id in op_total_outputs_num}
-    while edges:
-        current_edge = edges.popleft()
-        source_op = get_source_op(current_edge)
-        op_visited_outputs_num[source_op.id] += 1
-        # Actually visit the operator only when all the nodes it contributes to
-        # were already processed
-        if op_visited_outputs_num[source_op.id] == op_total_outputs_num[source_op.id]:
-            ops.append(source_op)
-            edges.extend(get_op_input_edges(source_op))
-    ops.reverse()
+
+    # Depth-first search returns the graph topologically sorted.
+    # We go over each operator's inputs before adding it to the list.
+
+    def visit_op(op):
+        if id(op) in visited:
+            return
+        visited.add(id(op))
+        op.check_args()
+        # visit conttributing inputs
+        for edge in get_op_input_edges(op):
+            visit_op(get_source_op(edge))
+        # add the operator to the list of contributing ops
+        ops.append(op)
+
+    for edge in output_nodes:
+        visit_op(get_source_op(edge))
+
     return ops
 
 

diff --git a/dali/test/python/auto_aug/test_auto_augment.py b/dali/test/python/auto_aug/test_auto_augment.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -210,8 +210,8 @@ def pipeline():
 @params(
     (False, "cpu", 256),
     (False, "gpu", 512),
-    (True, "cpu", 400),
-    (True, "gpu", 348),
+    (True, "cpu", 2000),
+    (True, "gpu", 2000),
 )
 def test_sub_policy(randomly_negate, dev, batch_size):
     num_magnitude_bins = 10
@@ -305,9 +305,8 @@ def third(data, op_id_mag_id):
                     expected_counts.append(expected)
             stat = chisquare(counts, expected_counts)
             # assert that the magnitudes negation looks independently enough
-            # (0.05 <=), but also that it is not too ideal (i.e. like all
-            # cases happening exactly the expected number of times)
-            assert 0.05 <= stat.pvalue <= 0.95, f"{stat}"
+            # (0.01 <=)
+            assert 0.01 <= stat.pvalue, f"{stat}"
 
 
 @params(("cpu",), ("gpu",))
@@ -397,7 +396,7 @@ def second_stage_only(data, op_id_mag_id):
     )
 
     policy = Policy("MyPolicy", num_magnitude_bins=num_magnitude_bins, sub_policies=sub_policies)
-    p = concat_aug_pipeline(batch_size=batch_size, dev=dev, policy=policy)
+    p = concat_aug_pipeline(batch_size=batch_size, dev=dev, policy=policy, seed=1234)
     p.build()
 
     for _ in range(5):
@@ -415,10 +414,8 @@ def second_stage_only(data, op_id_mag_id):
             actual.append(actual_counts[mags])
             expected.append(expected_counts[mags])
         stat = chisquare(actual, expected)
-        # assert that the magnitudes negation looks independently enough
-        # (0.05 <=), but also that it is not too ideal (i.e. like all
-        # cases happening exactly the expected number of times)
-        assert 0.05 <= stat.pvalue <= 0.95, f"{stat}"
+        # assert that the magnitudes negation looks independently enough (0.01 <=)
+        assert 0.01 <= stat.pvalue, f"{stat}"
 
 
 def test_policy_presentation():

diff --git a/dali/test/python/auto_aug/test_rand_augment.py b/dali/test/python/auto_aug/test_rand_augment.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -312,7 +312,7 @@ def pipeline():
             actual.append(actual_count[out])
             expected.append(expected_counts[out])
         stat = chisquare(actual, expected)
-        assert 0.01 <= stat.pvalue <= 0.99, f"{stat} {actual} {expected}"
+        assert 0.01 <= stat.pvalue, f"{stat} {actual} {expected}"
 
 
 def test_wrong_params_fail():

diff --git a/dali/test/python/auto_aug/test_trivial_augment.py b/dali/test/python/auto_aug/test_trivial_augment.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -236,4 +236,4 @@ def pipeline():
         stat = chisquare(actual, expected)
         stats.append(stat)
     mean_p_val = sum(stat.pvalue for stat in stats) / len(stats)
-    assert 0.05 <= mean_p_val <= 0.95, f"{mean_p_val} {stat} {actual} {expected}"
+    assert 0.01 <= mean_p_val, f"{mean_p_val} {stat} {actual} {expected}"