Fix injection of GPU buffers that do not go by a Func name (i.e. alloc groups). #8333

Open · mcourteaux wants to merge 2 commits into main

Conversation

mcourteaux (Contributor) commented:

When, for some reason, an allocation group providing fused storage for multiple Funcs originally intended to go in GPUShared gets lifted out of the GPU-block loops and ends up in Heap memory instead, the profiler injection logic assumed that the buffer came from a Func with the same name, and incorrectly concluded that it lives on the stack, ignoring the custom_new and custom_delete attributes of the Allocate node.
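
For context, the printed custom_new and custom_delete blocks in the Stmt below correspond to two fields of the Allocate IR node. Paraphrasing the node definition from src/IR.h (the comments here are mine):

struct Allocate : public StmtNode<Allocate> {
    std::string name;
    Type type;
    MemoryType memory_type;
    std::vector<Expr> extents;
    Expr condition;
    Expr new_expr;              // printed as "custom_new { ... }"
    std::string free_function;  // printed as "custom_delete { ... }"
    // ...
};

When new_expr is defined, the allocation is serviced by that expression (here, a wrapper halide_buffer_t handed to halide_device_malloc) rather than by an ordinary stack or heap allocation, which is exactly the information the old logic discarded.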

Consider this example (also included as a new test):

#include "Halide.h"

using namespace Halide;

int main(int argc, char *argv[]) {

    Target t = get_jit_target_from_environment();
    if (!t.has_gpu_feature()) {
        printf("[SKIP] GPU not enabled\n");
        return 0;
    }

    Var x{"x"}, y{"y"};

    Func f1{"f1"}, f2{"f2"};
    f1(x, y) = cast<float>(x + y);
    f2(x, y) = f1(x, y) * 2;

    Func result{"result"};
    result(x, y) = f2(x, y);

    Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    result
        .compute_root()
        .gpu_tile(x, y, xo, yo, xi, yi, 16, 16)
        .reorder(xi, yi, xo, yo)
        ;

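    // Both f1 and f2 are stored on the heap at the same compute_at site;
    // lowering fuses their storage into a single allocation group that is
    // then lifted outside the GPU-block loops (see the Stmt below).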
    f2.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    f1.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    result.print_loop_nest();

    t.set_feature(Target::Profile); // Make sure profiling is enabled!
    result.compile_jit(t);

    printf("Success!\n");
    return 0;
}
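
(To run the example, set HL_JIT_TARGET to a GPU target such as host-cuda; as written, the test skips itself when the JIT target has no GPU feature.)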

This produces the following Stmt right before the Profiling pass:

assert(reinterpret<uint64>((struct halide_buffer_t *)result.buffer) != (uint64)0, halide_error_buffer_argument_is_null("result"))
let result = (void *)_halide_buffer_get_host((struct halide_buffer_t *)result.buffer)
let result.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)result.buffer)
let result.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)result.buffer)
let result.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 0)
let result.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 0)
let result.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 0)
let result.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 1)
let result.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 1)
let result.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 1)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)result.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)result.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 2, (struct halide_dimension_t *)make_struct((min(result.extent.0, 16) + result.min.0) + -16, max(result.extent.0, 16), 1, 0, (min(result.extent.1, 16) + result.min.1) + -16, max(result.extent.1, 16), max(result.extent.0, 16), 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 assert(result.type == (uint32)73730, halide_error_bad_type("Output buffer result", result.type, (uint32)73730))
 assert(result.dimensions == 2, halide_error_bad_dimensions("Output buffer result", result.dimensions, 2))
 assert((16 <= result.extent.0) && (((max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -16) <= (result.extent.0 + result.min.0)), halide_error_access_out_of_bounds("Output buffer result", 0, (min(result.extent.0, 16) + result.min.0) + -16, (max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -17, result.min.0, (result.extent.0 + result.min.0) + -1))
 assert((16 <= result.extent.1) && (((max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -16) <= (result.extent.1 + result.min.1)), halide_error_access_out_of_bounds("Output buffer result", 1, (min(result.extent.1, 16) + result.min.1) + -16, (max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -17, result.min.1, (result.extent.1 + result.min.1) + -1))
 assert(result.stride.0 == 1, halide_error_constraint_violated("result.stride.0", result.stride.0, "1", 1))
 let result.total_extent.1 = int64(result.extent.1)*int64(result.extent.0)
 assert((uint64)abs(int64(result.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.0)), (uint64)2147483647))
 assert((uint64)abs(int64(result.extent.1)*int64(result.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.1)*int64(result.stride.1)), (uint64)2147483647))
 assert(result.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("result", result.total_extent.1, (int64)2147483647))
 profiling_enable_instance_marker()
 produce result {
  let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)result.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
  allocate allocgroup__f1$0.0__f2$0.1.buffer[float32 * 1]
   custom_new { let t13 = (struct halide_dimension_t *)make_struct(0, 512, 1, 0, 0, (result.extent.0 + 15)/16, 512, 0, 0, (result.extent.1 + 15)/16, ((result.extent.0 + 15)/16)*512, 0) in (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)alloca(size_of_halide_buffer_t()), t13, reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 3, t13, (uint64)0) }
   custom_delete { halide_device_free_as_destructor(allocgroup__f1$0.0__f2$0.1.buffer); }
  let t14 = halide_device_malloc(allocgroup__f1$0.0__f2$0.1.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(t14 == 0, t14)
  let allocgroup__f1$0.0__f2$0.1 = (void *)_halide_buffer_get_device(allocgroup__f1$0.0__f2$0.1.buffer)
  gpu_block<CUDA> (result.s0.y.yo.block_id_y, 0, (result.extent.1 + 15)/16) {
   gpu_block<CUDA> (result.s0.x.xo.block_id_x, 0, (result.extent.0 + 15)/16) {
    gpu_thread<CUDA> (.thread_id_y, 0, 16) {
     gpu_thread<CUDA> (.thread_id_x, 0, 16) {
      let result.s0.y.yi.base.s = min(result.s0.y.yo.block_id_y*16, result.extent.1 + -16)
      let result.s0.x.xi.base.s = min(result.s0.x.xo.block_id_x*16, result.extent.0 + -16)
      produce f1$0 {
       allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)] = float32((((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x) + ((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)))
      }
      gpu_thread_barrier(1)
      produce f2$0 {
       consume f1$0 {
        allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256] = allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)]*2.000000f
       }
      }
      gpu_thread_barrier(1)
      consume f2$0 {
       result[((((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)*result.stride.1) + ((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x)) - ((result.min.1*result.stride.1) + result.min.0)] = allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256]
      }
     }
    }
   }
  }
  free allocgroup__f1$0.0__f2$0.1.buffer
  _halide_buffer_set_device_dirty((struct halide_buffer_t *)result.buffer, (uint1)1)
 }
}

Notice how allocgroup__f1$0.0__f2$0.1.buffer is allocated outside the outermost GPU-block loop. When this buffer did not get lifted out of the kernel, profiling was not an issue, because the profiler does not traverse into the IR inside GPU loops.

The offending line, hit when instrumenting the Allocate node, was:

Function func = lookup_function(op->name);

The allocation was then incorrectly determined to be on_stack=true.

This PR checks whether the Allocate node has a custom_new expression and, if so, overrides on_stack to false.
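
In spirit, the fix amounts to the following (a minimal sketch rather than the exact diff; the on_stack flag is the one the profiler computes per allocation):

// In the profiler's Allocate visitor in src/Profiling.cpp: an allocation
// that carries a custom_new expression is serviced by custom allocation
// code (here, halide_device_malloc on a heap-backed wrapper buffer), so
// it can never live on the stack, whatever the Func lookup concluded.
if (op->new_expr.defined()) {
    on_stack = false;
}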

@abadams I wonder if we can't simply rely on Allocate::MemoryType to determine on_stack, or is that still Auto at that moment?
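
For reference, that alternative would look roughly like this; it is only sound if MemoryType::Auto has already been resolved to a concrete choice by the time the profiler runs:

// Hypothetical alternative: derive on_stack from the scheduled memory
// type instead of looking up the Func by name. This breaks if
// memory_type can still be MemoryType::Auto at this stage of lowering.
bool on_stack = (op->memory_type == MemoryType::Stack);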
