Fix injection of GPU buffers that do not go by a Func name (i.e. alloc groups). #8333

Open · mcourteaux wants to merge 2 commits into main

Conversation

mcourteaux (Contributor) commented:

When, for some reason, an allocation group providing fused storage for multiple Funcs originally intended to go in GPUShared gets lifted out of the GPU-block loops and ends up in Heap memory instead, the profiler injection logic assumed that the buffer came from a Func with the same name, and incorrectly concluded that it lives on the stack, ignoring the custom_new and custom_delete attributes of the Allocate node.
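
For context, the printed custom_new and custom_delete blocks in the Stmt below correspond to two fields of the Allocate IR node. Paraphrasing the node definition from src/IR.h (the comments here are mine):

struct Allocate : public StmtNode<Allocate> {
    std::string name;
    Type type;
    MemoryType memory_type;
    std::vector<Expr> extents;
    Expr condition;
    Expr new_expr;              // printed as "custom_new { ... }"
    std::string free_function;  // printed as "custom_delete { ... }"
    // ...
};

When new_expr is defined, the allocation is serviced by that expression (here, a wrapper halide_buffer_t handed to halide_device_malloc) rather than by an ordinary stack or heap allocation, which is exactly the information the old logic discarded.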

Consider this example (also included as a new test):

#include "Halide.h"

using namespace Halide;

int main(int argc, char *argv[]) {

    Target t = get_jit_target_from_environment();
    if (!t.has_gpu_feature()) {
        printf("[SKIP] GPU not enabled\n");
        return 0;
    }

    Var x{"x"}, y{"y"};

    Func f1{"f1"}, f2{"f2"};
    f1(x, y) = cast<float>(x + y);
    f2(x, y) = f1(x, y) * 2;

    Func result{"result"};
    result(x, y) = f2(x, y);

    Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    result
        .compute_root()
        .gpu_tile(x, y, xo, yo, xi, yi, 16, 16)
        .reorder(xi, yi, xo, yo)
        ;

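    // Both f1 and f2 are stored on the heap at the same compute_at site;
    // lowering fuses their storage into a single allocation group that is
    // then lifted outside the GPU-block loops (see the Stmt below).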
    f2.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    f1.compute_at(result, xo)
        .gpu_threads(x, y)
        .store_in(MemoryType::Heap)
        ;

    result.print_loop_nest();

    t.set_feature(Target::Profile); // Make sure profiling is enabled!
    result.compile_jit(t);

    printf("Success!\n");
    return 0;
}
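
(To run the example, set HL_JIT_TARGET to a GPU target such as host-cuda; as written, the test skips itself when the JIT target has no GPU feature.)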

This produces the following Stmt right before the Profiling pass:

assert(reinterpret<uint64>((struct halide_buffer_t *)result.buffer) != (uint64)0, halide_error_buffer_argument_is_null("result"))
let result = (void *)_halide_buffer_get_host((struct halide_buffer_t *)result.buffer)
let result.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)result.buffer)
let result.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)result.buffer)
let result.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 0)
let result.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 0)
let result.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 0)
let result.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)result.buffer, 1)
let result.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)result.buffer, 1)
let result.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)result.buffer, 1)
if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)result.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)result.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 2, (struct halide_dimension_t *)make_struct((min(result.extent.0, 16) + result.min.0) + -16, max(result.extent.0, 16), 1, 0, (min(result.extent.1, 16) + result.min.1) + -16, max(result.extent.1, 16), max(result.extent.0, 16), 0), (uint64)0)
}
if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)result.buffer)) {
 assert(result.type == (uint32)73730, halide_error_bad_type("Output buffer result", result.type, (uint32)73730))
 assert(result.dimensions == 2, halide_error_bad_dimensions("Output buffer result", result.dimensions, 2))
 assert((16 <= result.extent.0) && (((max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -16) <= (result.extent.0 + result.min.0)), halide_error_access_out_of_bounds("Output buffer result", 0, (min(result.extent.0, 16) + result.min.0) + -16, (max(result.extent.0, 16) + (min(result.extent.0, 16) + result.min.0)) + -17, result.min.0, (result.extent.0 + result.min.0) + -1))
 assert((16 <= result.extent.1) && (((max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -16) <= (result.extent.1 + result.min.1)), halide_error_access_out_of_bounds("Output buffer result", 1, (min(result.extent.1, 16) + result.min.1) + -16, (max(result.extent.1, 16) + (min(result.extent.1, 16) + result.min.1)) + -17, result.min.1, (result.extent.1 + result.min.1) + -1))
 assert(result.stride.0 == 1, halide_error_constraint_violated("result.stride.0", result.stride.0, "1", 1))
 let result.total_extent.1 = int64(result.extent.1)*int64(result.extent.0)
 assert((uint64)abs(int64(result.extent.0)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.0)), (uint64)2147483647))
 assert((uint64)abs(int64(result.extent.1)*int64(result.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("result", (uint64)abs(int64(result.extent.1)*int64(result.stride.1)), (uint64)2147483647))
 assert(result.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("result", result.total_extent.1, (int64)2147483647))
 profiling_enable_instance_marker()
 produce result {
  let halide_copy_to_device_result = halide_copy_to_device((struct halide_buffer_t *)result.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(halide_copy_to_device_result == 0, halide_copy_to_device_result)
  allocate allocgroup__f1$0.0__f2$0.1.buffer[float32 * 1]
   custom_new { let t13 = (struct halide_dimension_t *)make_struct(0, 512, 1, 0, 0, (result.extent.0 + 15)/16, 512, 0, 0, (result.extent.1 + 15)/16, ((result.extent.0 + 15)/16)*512, 0) in (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)alloca(size_of_halide_buffer_t()), t13, reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 2, 32, 3, t13, (uint64)0) }
   custom_delete { halide_device_free_as_destructor(allocgroup__f1$0.0__f2$0.1.buffer); }
  let t14 = halide_device_malloc(allocgroup__f1$0.0__f2$0.1.buffer, (struct halide_device_interface_t const *)halide_cuda_device_interface())
  assert(t14 == 0, t14)
  let allocgroup__f1$0.0__f2$0.1 = (void *)_halide_buffer_get_device(allocgroup__f1$0.0__f2$0.1.buffer)
  gpu_block<CUDA> (result.s0.y.yo.block_id_y, 0, (result.extent.1 + 15)/16) {
   gpu_block<CUDA> (result.s0.x.xo.block_id_x, 0, (result.extent.0 + 15)/16) {
    gpu_thread<CUDA> (.thread_id_y, 0, 16) {
     gpu_thread<CUDA> (.thread_id_x, 0, 16) {
      let result.s0.y.yi.base.s = min(result.s0.y.yo.block_id_y*16, result.extent.1 + -16)
      let result.s0.x.xi.base.s = min(result.s0.x.xo.block_id_x*16, result.extent.0 + -16)
      produce f1$0 {
       allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)] = float32((((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x) + ((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)))
      }
      gpu_thread_barrier(1)
      produce f2$0 {
       consume f1$0 {
        allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256] = allocgroup__f1$0.0__f2$0.1[(((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)]*2.000000f
       }
      }
      gpu_thread_barrier(1)
      consume f2$0 {
       result[((((result.min.1 + result.s0.y.yi.base.s) + .thread_id_y)*result.stride.1) + ((result.min.0 + result.s0.x.xi.base.s) + .thread_id_x)) - ((result.min.1*result.stride.1) + result.min.0)] = allocgroup__f1$0.0__f2$0.1[((((((result.extent.0 + 15)/16)*result.s0.y.yo.block_id_y) + result.s0.x.xo.block_id_x)*512) + ((.thread_id_y*16) + .thread_id_x)) + 256]
      }
     }
    }
   }
  }
  free allocgroup__f1$0.0__f2$0.1.buffer
  _halide_buffer_set_device_dirty((struct halide_buffer_t *)result.buffer, (uint1)1)
 }
}

Notice how allocgroup__f1$0.0__f2$0.1.buffer is allocated outside the outermost GPU-block loop. When this buffer did not get lifted out of the kernel, profiling was not an issue, because the profiler does not traverse into the IR inside GPU loops.

The offending line, hit when instrumenting the Allocate node, was:

Function func = lookup_function(op->name);

The allocation was then incorrectly determined to be on_stack=true.

This PR checks whether the Allocate node has a custom_new expression and, if so, overrides on_stack to false.
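
In spirit, the fix amounts to the following (a minimal sketch rather than the exact diff; the on_stack flag is the one the profiler computes per allocation):

// In the profiler's Allocate visitor in src/Profiling.cpp: an allocation
// that carries a custom_new expression is serviced by custom allocation
// code (here, halide_device_malloc on a heap-backed wrapper buffer), so
// it can never live on the stack, whatever the Func lookup concluded.
if (op->new_expr.defined()) {
    on_stack = false;
}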

@abadams I wonder if we can't simply rely on Allocate::MemoryType to determine on_stack, or is that still Auto at that moment?
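
For reference, that alternative would look roughly like this; it is only sound if MemoryType::Auto has already been resolved to a concrete choice by the time the profiler runs:

// Hypothetical alternative: derive on_stack from the scheduled memory
// type instead of looking up the Func by name. This breaks if
// memory_type can still be MemoryType::Auto at this stage of lowering.
bool on_stack = (op->memory_type == MemoryType::Stack);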
