Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dr.Jit compilation failure when running CUDA variants of mitsuba #803

Closed
memamsaleh opened this issue Jul 12, 2023 · 2 comments
Closed

Dr.Jit compilation failure when running CUDA variants of mitsuba #803

memamsaleh opened this issue Jul 12, 2023 · 2 comments

Comments

@memamsaleh
Copy link

Summary

CUDA variants of Mitsuba stopped working (Dr.Jit compilation failure) after upgrading the GPU driver.

System configuration

System information:

OS: Windows-10
CPU: AMD64 Family 25 Model 33 Stepping 2, AuthenticAMD
GPU: NVIDIA GeForce RTX 4090
Python: 3.11.4 | packaged by Anaconda, Inc. | [MSC v.1916 64 bit (AMD64)]
NVidia driver: 536.40
CUDA: 12.2.91
LLVM: -1.-1.-1

Dr.Jit: 0.4.2
Mitsuba: 3.3.0
Is custom build? False
Compiled with: MSVC 19.34.31937.0
Variants:
scalar_rgb
cuda_ad_rgb
cuda_rgb

Description

Mitsuba was working fine until I upgraded my GPU drivers and tried pulling and rebuilding the latest Mitsuba. Currently, any CUDA-based variant results in a Dr.Jit compilation failure when running the code. I tried reverting to an older CUDA version (11.6); however, it did not help. Here is the full log:

Critical Dr.Jit compiler failure: jit_optix_compile(): optixModuleGetCompilationState() indicates that the compilation did not complete succesfully. The module's compilation state is: 0x2363
Please see the PTX assembly listing and error message below:

.version 7.4
.target sm_87
.address_size 64

.const .align 8 .b8 params[64];

.entry __raygen__5fcc954341d36b0bc109d39ce0db27bd() {
    .reg.b8   %b <253>; .reg.b16 %w<253>; .reg.b32 %r<253>;
    .reg.b64  %rd<253>; .reg.f32 %f<253>; .reg.f64 %d<253>;
    .reg.pred %p <253>;

    call (%r0), _optix_get_launch_index_x, ();
    ld.const.u32 %r1, [params + 4];
    add.u32 %r0, %r0, %r1;

body:
    ld.const.u64 %rd0, [params+8];
    ldu.global.u64 %rd6, [%rd0];
    mov.pred %p7, 0x1;
    ld.const.u64 %rd0, [params+16];
    mad.wide.u32 %rd0, %r0, 4, %rd0;
    ld.global.cs.f32 %f8, [%rd0];
    ld.const.u64 %rd0, [params+24];
    mad.wide.u32 %rd0, %r0, 4, %rd0;
    ld.global.cs.f32 %f9, [%rd0];
    ld.const.u64 %rd0, [params+32];
    mad.wide.u32 %rd0, %r0, 4, %rd0;
    ld.global.cs.f32 %f10, [%rd0];
    mov.b32 %f11, 0x0;
    mov.b32 %f12, 0xbee4f92e;
    mov.b32 %f13, 0xbf64f92e;
    mov.b32 %f14, 0x7f7fffff;
    mov.b32 %r15, 0xff;
    mov.b32 %r16, 0x0;
    mov.b32 %r17, 0x1;
    .reg.u32 %u18_out_<32>;
    .reg.u32 %u18_payload_type, %u18_payload_count;
    mov.u32 %u18_payload_type, 0;
    mov.u32 %u18_payload_count, 6;
    call (%u18_out_0, %u18_out_1, %u18_out_2, %u18_out_3, %u18_out_4, %u18_out_5, %u18_out_6, %u18_out_7, %u18_out_8, %u18_out_9, %u18_out_10, %u18_out_11, %u18_out_12, %u18_out_13, %u18_out_14, %u18_out_15, %u18_out_16, %u18_out_17, %u18_out_18, %u18_out_19, %u18_out_20, %u18_out_21, %u18_out_22, %u18_out_23, %u18_out_24, %u18_out_25, %u18_out_26, %u18_out_27, %u18_out_28, %u18_out_29, %u18_out_30, %u18_out_31), _optix_trace_typed_32, (%u18_payload_type, %rd6, %f8, %f9, %f10, %f11, %f12, %f13, %f11, %f14, %f11, %r15, %r16, %r16, %r17, %r16, %u18_payload_count, %r16, %r16, %r16, %r16, %r16, %r16, %u18_out_6, %u18_out_7, %u18_out_8, %u18_out_9, %u18_out_10, %u18_out_11, %u18_out_12, %u18_out_13, %u18_out_14, %u18_out_15, %u18_out_16, %u18_out_17, %u18_out_18, %u18_out_19, %u18_out_20, %u18_out_21, %u18_out_22, %u18_out_23, %u18_out_24, %u18_out_25, %u18_out_26, %u18_out_27, %u18_out_28, %u18_out_29, %u18_out_30, %u18_out_31);
    mov.b32 %r19, %u18_out_0;
    mov.b32 %f20, %r19;
    mov.b32 %f21, 0x7f800000;
    setp.ne.f32 %p22, %f20, %f21;
    not.pred %p23, %p22;
    mov.b32 %r24, %u18_out_5;
    selp.u32 %r25, %r16, %r24, %p23;
    setp.eq.u32 %p26, %r25, %r16;
    mov.b32 %r27, %u18_out_4;
    selp.u32 %r28, %r16, %r27, %p23;
    selp.u32 %r29, %r28, %r25, %p26;
    mov.b32 %r30, %u18_out_1;
    mov.b32 %f31, %r30;
    mov.b32 %r32, %u18_out_2;
    mov.b32 %f33, %r32;
    mov.b32 %r34, %u18_out_3;
    mov.b32 %r35, 0x0;
    setp.ne.u32 %p36, %r29, %r35;
    and.pred %p37, %p22, %p36;
    ld.const.u64 %rd38, [params+40];
    ld.const.u64 %rd39, [params+48];

    @!%p37 bra l_masked_40;

    { // VCall: mitsuba::Shape::compute_surface_interaction()

        mad.wide.u32 %rd3, %r29, 8, %rd38;
        ld.global.u64 %rd3, [%rd3];
        cvt.u32.u64 %r3, %rd3;
        call (%rd2), _optix_call_direct_callable, (%r3);
        shr.u64 %rd3, %rd3, 32;
        add.u64 %rd3, %rd3, %rd39;

        {
            proto: .callprototype (.param .align 4 .b8 result[76]) _(.reg .u32 self, .reg .u64 data, .param .align 4 .b8 params[28]);
            .param .align 4 .b8 out[76];
            .param .align 4 .b8 in[28];
            st.param.f32 [in+0], %f8;
            st.param.f32 [in+4], %f9;
            st.param.f32 [in+8], %f10;
            st.param.f32 [in+12], %f20;
            st.param.f32 [in+16], %f31;
            st.param.f32 [in+20], %f33;
            st.param.u32 [in+24], %r34;
            call (out), %rd2, (%r29, %rd3, in), proto;
            ld.param.f32 %f41, [out+0];
            ld.param.f32 %f45, [out+4];
            ld.param.f32 %f51, [out+8];
            ld.param.f32 %f52, [out+12];
            ld.param.f32 %f53, [out+16];
            ld.param.f32 %f50, [out+20];
            ld.param.f32 %f49, [out+24];
            ld.param.f32 %f42, [out+28];
            ld.param.f32 %f46, [out+44];
            ld.param.f32 %f44, [out+48];
            ld.param.f32 %f48, [out+52];
            ld.param.f32 %f43, [out+56];
            ld.param.f32 %f47, [out+60];
        }

        bra.uni l_done_40;
    }

l_masked_40:
    mov.b32 %f41, 0;
    mov.b32 %f45, 0;
    mov.b32 %f51, 0;
    mov.b32 %f52, 0;
    mov.b32 %f53, 0;
    mov.b32 %f50, 0;
    mov.b32 %f49, 0;
    mov.b32 %f42, 0;
    mov.b32 %f46, 0;
    mov.b32 %f44, 0;
    mov.b32 %f48, 0;
    mov.b32 %f43, 0;
    mov.b32 %f47, 0;

l_done_40:
    mov.b32 %f54, 0x0;
    not.pred %p55, %p22;
    mov.b32 %f56, 0x7f800000;
    selp.f32 %f57, %f56, %f41, %p55;
    mov.b32 %r58, 0x0;
    mov.b64 %rd59, 0x0;
    mov.b64 %rd60, 0x5851f42d4c957f2d;
    mov.b64 %rd61, 0xda3e39cb94b95bdb;
    mov.u32 %r62, %r0;
    cvt.u64.u32 %rd63, %r62;
    add.u64 %rd64, %rd61, %rd63;
    mov.b64 %rd65, 0x1;
    cvt.u32.u64 %r3, %rd65;
    shl.b64 %rd66, %rd64, %r3;
    or.b64 %rd67, %rd66, %rd65;
    mad.lo.u64 %rd68, %rd59, %rd60, %rd67;
    mov.b64 %rd69, 0x853c49e6748fea9b;
    add.u64 %rd70, %rd68, %rd69;
    mad.lo.u64 %rd71, %rd70, %rd60, %rd67;
    mov.b32 %f72, 0x0;
    setp.eq.f32 %p73, %f45, %f72;
    setp.eq.f32 %p74, %f46, %f72;
    and.pred %p75, %p73, %p74;
    setp.eq.f32 %p76, %f47, %f72;
    and.pred %p77, %p75, %p76;
    setp.ge.f32 %p78, %f43, %f72;
    mul.ftz.f32 %f79, %f44, %f48;
    mov.b32 %f80, 0x3f800000;
    mov.b32 %f81, 0xbf800000;
    selp.f32 %f82, %f80, %f81, %p78;
    add.ftz.f32 %f83, %f82, %f43;
    rcp.approx.ftz.f32 %f84, %f83;
    neg.ftz.f32 %f85, %f84;
    mul.ftz.f32 %f86, %f79, %f85;
    neg.ftz.f32 %f87, %f86;
    selp.f32 %f88, %f86, %f87, %p78;
    mul.ftz.f32 %f89, %f44, %f45;
    fma.rn.ftz.f32 %f90, %f48, %f46, %f89;
    fma.rn.ftz.f32 %f91, %f43, %f47, %f90;
    neg.ftz.f32 %f92, %f91;
    fma.rn.ftz.f32 %f93, %f48, %f92, %f46;
    fma.rn.ftz.f32 %f94, %f43, %f92, %f47;
    fma.rn.ftz.f32 %f95, %f44, %f92, %f45;
    mul.ftz.f32 %f96, %f95, %f95;
    fma.rn.ftz.f32 %f97, %f93, %f93, %f96;
    fma.rn.ftz.f32 %f98, %f94, %f94, %f97;
    rsqrt.approx.ftz.f32 %f99, %f98;
    mul.ftz.f32 %f100, %f93, %f99;
    selp.f32 %f101, %f88, %f100, %p77;
    mul.ftz.f32 %f102, %f44, %f44;
    mul.ftz.f32 %f103, %f102, %f85;
    neg.ftz.f32 %f104, %f103;
    selp.f32 %f105, %f103, %f104, %p78;
    add.ftz.f32 %f106, %f105, %f80;
    mul.ftz.f32 %f107, %f95, %f99;
    selp.f32 %f108, %f106, %f107, %p77;
    mul.ftz.f32 %f109, %f48, %f108;
    neg.ftz.f32 %f110, %f109;
    fma.rn.ftz.f32 %f111, %f44, %f101, %f110;
    neg.ftz.f32 %f112, %f44;
    selp.f32 %f113, %f112, %f44, %p78;
    mul.ftz.f32 %f114, %f94, %f99;
    selp.f32 %f115, %f113, %f114, %p77;
    mul.ftz.f32 %f116, %f44, %f115;
    neg.ftz.f32 %f117, %f116;
    fma.rn.ftz.f32 %f118, %f43, %f108, %f117;
    mul.ftz.f32 %f119, %f43, %f101;
    neg.ftz.f32 %f120, %f119;
    fma.rn.ftz.f32 %f121, %f48, %f115, %f120;
    mov.f32 %f122, %f54;
    mov.u32 %r123, %r58;
    mov.u64 %rd124, %rd71;

l_125_cond: // Loop ()
    mov.b32 %f126, 0x7f800000;
    setp.ne.f32 %p127, %f57, %f126;
    mov.b32 %r128, 0x100;
    setp.lo.u32 %p129, %r123, %r128;
    and.pred %p130, %p127, %p129;
    @!%p130 bra l_125_done;

l_125_body:
    mov.u64 %rd132, %rd124;
    mov.u32 %r133, %r123;
    mov.f32 %f134, %f122;
    mov.b64 %rd135, 0x5851f42d4c957f2d;
    mad.lo.u64 %rd136, %rd132, %rd135, %rd67;
    mad.lo.u64 %rd137, %rd136, %rd135, %rd67;
    mov.b32 %r138, 0x1;
    add.u32 %r139, %r133, %r138;
    mov.pred %p140, 0x1;
    mov.b32 %f141, 0x3f800000;
    mov.b32 %f142, 0x40000000;
    mov.b64 %rd143, 0x12;
    cvt.u32.u64 %r3, %rd143;
    shr.u64 %rd144, %rd132, %r3;
    xor.b64 %rd145, %rd144, %rd132;
    mov.b64 %rd146, 0x1b;
    cvt.u32.u64 %r3, %rd146;
    shr.u64 %rd147, %rd145, %r3;
    cvt.u32.u64 %r148, %rd147;
    mov.b64 %rd149, 0x3b;
    cvt.u32.u64 %r3, %rd149;
    shr.u64 %rd150, %rd132, %r3;
    cvt.u32.u64 %r151, %rd150;
    shr.u32 %r152, %r148, %r151;
    mov.b32 %r153, %r151;
    neg.s32 %r154, %r153;
    mov.b32 %r155, 0x1f;
    and.b32 %r156, %r154, %r155;
    mov.b32 %r157, %r156;
    shl.b32 %r158, %r148, %r157;
    or.b32 %r159, %r152, %r158;
    mov.b32 %r160, 0x9;
    shr.u32 %r161, %r159, %r160;
    mov.b32 %r162, 0x3f800000;
    or.b32 %r163, %r161, %r162;
    mov.b32 %f164, %r163;
    sub.ftz.f32 %f165, %f164, %f141;
    mov.b32 %f166, 0xbf800000;
    fma.rn.ftz.f32 %f167, %f142, %f165, %f166;
    abs.f32 %f168, %f167;
    cvt.u32.u64 %r3, %rd143;
    shr.u64 %rd169, %rd136, %r3;
    xor.b64 %rd170, %rd169, %rd136;
    cvt.u32.u64 %r3, %rd146;
    shr.u64 %rd171, %rd170, %r3;
    cvt.u32.u64 %r172, %rd171;
    cvt.u32.u64 %r3, %rd149;
    shr.u64 %rd173, %rd136, %r3;
    cvt.u32.u64 %r174, %rd173;
    shr.u32 %r175, %r172, %r174;
    mov.b32 %r176, %r174;
    neg.s32 %r177, %r176;
    and.b32 %r178, %r177, %r155;
    mov.b32 %r179, %r178;
    shl.b32 %r180, %r172, %r179;
    or.b32 %r181, %r175, %r180;
    shr.u32 %r182, %r181, %r160;
    or.b32 %r183, %r182, %r162;
    mov.b32 %f184, %r183;
    sub.ftz.f32 %f185, %f184, %f141;
    fma.rn.ftz.f32 %f186, %f142, %f185, %f166;
    abs.f32 %f187, %f186;
    setp.lt.f32 %p188, %f168, %f187;
    selp.f32 %f189, %f186, %f167, %p188;
    mov.b32 %f190, 0x0;
    setp.eq.f32 %p191, %f167, %f190;
    setp.eq.f32 %p192, %f186, %f190;
    and.pred %p193, %p191, %p192;
    mov.b32 %f194, 0x3fc90fdb;
    mov.b32 %f195, 0x3f490fdb;
    selp.f32 %f196, %f167, %f186, %p188;
    mul.ftz.f32 %f197, %f195, %f196;
    div.approx.ftz.f32 %f198, %f197, %f189;
    sub.ftz.f32 %f199, %f194, %f198;
    selp.f32 %f200, %f199, %f198, %p188;
    selp.f32 %f201, %f190, %f200, %p193;
    sin.approx.ftz.f32 %f202, %f201;
    mul.ftz.f32 %f203, %f189, %f202;
    cos.approx.ftz.f32 %f204, %f201;
    mul.ftz.f32 %f205, %f189, %f204;
    mul.ftz.f32 %f206, %f205, %f205;
    fma.rn.ftz.f32 %f207, %f203, %f203, %f206;
    sub.ftz.f32 %f208, %f141, %f207;
    add.ftz.f32 %f209, %f208, %f141;
    sqrt.approx.ftz.f32 %f210, %f209;
    mul.ftz.f32 %f211, %f203, %f210;
    mul.ftz.f32 %f212, %f205, %f210;
    mul.ftz.f32 %f213, %f115, %f212;
    fma.rn.ftz.f32 %f214, %f111, %f211, %f213;
    fma.rn.ftz.f32 %f215, %f43, %f208, %f214;
    mul.ftz.f32 %f216, %f101, %f212;
    fma.rn.ftz.f32 %f217, %f118, %f211, %f216;
    fma.rn.ftz.f32 %f218, %f48, %f208, %f217;
    mul.ftz.f32 %f219, %f108, %f212;
    fma.rn.ftz.f32 %f220, %f121, %f211, %f219;
    fma.rn.ftz.f32 %f221, %f44, %f208, %f220;
    mul.ftz.f32 %f222, %f50, %f221;
    fma.rn.ftz.f32 %f223, %f49, %f218, %f222;
    fma.rn.ftz.f32 %f224, %f42, %f215, %f223;
    setp.ge.f32 %p225, %f224, %f190;
    abs.f32 %f226, %f51;
    abs.f32 %f227, %f52;
    max.ftz.f32 %f228, %f226, %f227;
    abs.f32 %f229, %f53;
    max.ftz.f32 %f230, %f228, %f229;
    add.ftz.f32 %f231, %f141, %f230;
    mov.b32 %f232, 0x38bb8000;
    mul.ftz.f32 %f233, %f231, %f232;
    neg.ftz.f32 %f234, %f233;
    selp.f32 %f235, %f233, %f234, %p225;
    fma.rn.ftz.f32 %f236, %f235, %f50, %f51;
    fma.rn.ftz.f32 %f237, %f235, %f49, %f52;
    fma.rn.ftz.f32 %f238, %f235, %f42, %f53;
    mov.b32 %f239, 0x3f400000;
    mov.b32 %r240, 0xff;
    mov.b32 %r241, 0xc;
    mov.b32 %r242, 0x0;
    .reg.u32 %u243_out_<32>;
    .reg.u32 %u243_payload_type, %u243_payload_count;
    mov.u32 %u243_payload_type, 0;
    mov.u32 %u243_payload_count, 1;
    call (%u243_out_0, %u243_out_1, %u243_out_2, %u243_out_3, %u243_out_4, %u243_out_5, %u243_out_6, %u243_out_7, %u243_out_8, %u243_out_9, %u243_out_10, %u243_out_11, %u243_out_12, %u243_out_13, %u243_out_14, %u243_out_15, %u243_out_16, %u243_out_17, %u243_out_18, %u243_out_19, %u243_out_20, %u243_out_21, %u243_out_22, %u243_out_23, %u243_out_24, %u243_out_25, %u243_out_26, %u243_out_27, %u243_out_28, %u243_out_29, %u243_out_30, %u243_out_31), _optix_trace_typed_32, (%u243_payload_type, %rd6, %f236, %f237, %f238, %f221, %f218, %f215, %f190, %f239, %f11, %r240, %r241, %r242, %r138, %r242, %u243_payload_count, %r138, %u243_out_1, %u243_out_2, %u243_out_3, %u243_out_4, %u243_out_5, %u243_out_6, %u243_out_7, %u243_out_8, %u243_out_9, %u243_out_10, %u243_out_11, %u243_out_12, %u243_out_13, %u243_out_14, %u243_out_15, %u243_out_16, %u243_out_17, %u243_out_18, %u243_out_19, %u243_out_20, %u243_out_21, %u243_out_22, %u243_out_23, %u243_out_24, %u243_out_25, %u243_out_26, %u243_out_27, %u243_out_28, %u243_out_29, %u243_out_30, %u243_out_31);
    mov.b32 %r244, %u243_out_0;
    setp.eq.u32 %p245, %r244, %r138;
    not.pred %p246, %p245;
    add.ftz.f32 %f247, %f134, %f141;
    selp.f32 %f248, %f247, %f134, %p246;
    mov.u64 %rd124, %rd137;
    mov.u32 %r123, %r139;
    mov.f32 %f122, %f248;
    bra l_125_cond;

l_125_done:
    mov.f32 %f250, %f122;
    mov.b32 %f251, 0x3b800000;
    mul.ftz.f32 %f252, %f250, %f251;
    ld.const.u64 %rd0, [params+56];
    mad.wide.u32 %rd0, %r0, 4, %rd0;
    st.global.cs.f32 [%rd0], %f252;
    ret;
}

.visible .func (.param .align 4 .b8 result[76]) __direct_callable__149725e9eb059a10d3b82a118e59783d(.reg .u32 self, .reg .u64 data, .param .align 4 .b8 params[28]) {
    // VCall: mitsuba::Shape::compute_surface_interaction()
    .reg.b8   %b <182>; .reg.b16 %w<182>; .reg.b32 %r<182>;
    .reg.b64  %rd<182>; .reg.f32 %f<182>; .reg.f64 %d<182>;
    .reg.pred %p <182>;

    ld.global.f32 %f4, [data+60];
    ld.global.f32 %f5, [data+64];
    ld.global.f32 %f6, [data+28];
    ld.global.f32 %f7, [data+32];
    ld.global.f32 %f8, [data+36];
    ld.global.f32 %f9, [data+40];
    ld.global.f32 %f10, [data+68];
    ld.global.f32 %f11, [data+12];
    ld.global.f32 %f12, [data+16];
    ld.global.f32 %f13, [data+20];
    ld.global.f32 %f14, [data+24];
    ld.global.f32 %f15, [data+0];
    ld.global.f32 %f16, [data+4];
    ld.global.f32 %f17, [data+8];
    ld.global.f32 %f18, [data+44];
    ld.global.f32 %f19, [data+48];
    ld.global.f32 %f20, [data+52];
    ld.global.f32 %f21, [data+56];
    ld.global.f32 %f22, [data+72];
    ld.global.f32 %f23, [data+76];
    ld.global.f32 %f24, [data+80];
    ld.global.f32 %f25, [data+84];
    ld.global.f32 %f26, [data+88];
    ld.global.f32 %f27, [data+92];
    ld.param.f32 %f28, [params+12];
    mov.b32 %f29, 0x0;
    mov.b32 %f30, 0xbf64f92e;
    ld.param.f32 %f31, [params+8];
    fma.rn.ftz.f32 %f32, %f30, %f28, %f31;
    mov.b32 %f33, 0xbee4f92e;
    ld.param.f32 %f34, [params+4];
    fma.rn.ftz.f32 %f35, %f33, %f28, %f34;
    ld.param.f32 %f36, [params+0];
    fma.rn.ftz.f32 %f37, %f29, %f28, %f36;
    fma.rn.ftz.f32 %f38, %f8, %f37, %f9;
    fma.rn.ftz.f32 %f39, %f7, %f35, %f38;
    fma.rn.ftz.f32 %f40, %f6, %f32, %f39;
    fma.rn.ftz.f32 %f41, %f13, %f37, %f14;
    fma.rn.ftz.f32 %f42, %f12, %f35, %f41;
    fma.rn.ftz.f32 %f43, %f11, %f32, %f42;
    neg.ftz.f32 %f44, %f43;
    mul.ftz.f32 %f45, %f10, %f44;
    fma.rn.ftz.f32 %f46, %f5, %f40, %f45;
    fma.rn.ftz.f32 %f47, %f4, %f29, %f46;
    mov.b32 %f48, 0x40c90fdb;
    mul.ftz.f32 %f49, %f47, %f48;
    sub.ftz.f32 %f50, %f37, %f15;
    sub.ftz.f32 %f51, %f32, %f16;
    sub.ftz.f32 %f52, %f35, %f17;
    mul.ftz.f32 %f53, %f50, %f50;
    fma.rn.ftz.f32 %f54, %f52, %f52, %f53;
    fma.rn.ftz.f32 %f55, %f51, %f51, %f54;
    rsqrt.approx.ftz.f32 %f56, %f55;
    mul.ftz.f32 %f57, %f50, %f56;
    mul.ftz.f32 %f58, %f52, %f56;
    mul.ftz.f32 %f59, %f51, %f56;
    mov.u32 %r60, self;
    setp.lt.f32 %p61, %f43, %f29;
    setp.lt.f32 %p62, %f40, %f29;
    mov.b32 %f63, 0x40490fdb;
    abs.f32 %f64, %f43;
    abs.f32 %f65, %f40;
    setp.gt.f32 %p66, %f64, %f65;
    mov.b32 %f67, 0x3fc90fdb;
    min.ftz.f32 %f68, %f64, %f65;
    max.ftz.f32 %f69, %f65, %f64;
    div.approx.ftz.f32 %f70, %f68, %f69;
    mul.ftz.f32 %f71, %f70, %f70;
    mul.ftz.f32 %f72, %f71, %f71;
    mul.ftz.f32 %f73, %f72, %f72;
    mov.b32 %f74, 0x3c00cd04;
    mov.b32 %f75, 0xbd179428;
    mov.b32 %f76, 0x3dabc06d;
    fma.rn.ftz.f32 %f77, %f71, %f75, %f76;
    fma.rn.ftz.f32 %f78, %f72, %f74, %f77;
    mov.b32 %f79, 0xbe0a1a99;
    mov.b32 %f80, 0x3e4b95bf;
    fma.rn.ftz.f32 %f81, %f71, %f79, %f80;
    mov.b32 %f82, 0xbeaaa1b5;
    mov.b32 %f83, 0x3f7ffff5;
    fma.rn.ftz.f32 %f84, %f71, %f82, %f83;
    fma.rn.ftz.f32 %f85, %f72, %f81, %f84;
    fma.rn.ftz.f32 %f86, %f73, %f78, %f85;
    mul.ftz.f32 %f87, %f86, %f70;
    sub.ftz.f32 %f88, %f67, %f87;
    selp.f32 %f89, %f88, %f87, %p66;
    sub.ftz.f32 %f90, %f63, %f89;
    selp.f32 %f91, %f90, %f89, %p62;
    neg.ftz.f32 %f92, %f91;
    selp.f32 %f93, %f92, %f91, %p61;
    setp.ne.f32 %p94, %f69, %f29;
    selp.b32 %f95, %f93, 0, %p94;
    setp.lt.f32 %p96, %f95, %f29;
    add.ftz.f32 %f97, %f95, %f48;
    selp.f32 %f98, %f97, %f95, %p96;
    mov.b32 %f99, 0x3e22f983;
    mul.ftz.f32 %f100, %f98, %f99;
    fma.rn.ftz.f32 %f101, %f20, %f37, %f21;
    fma.rn.ftz.f32 %f102, %f19, %f35, %f101;
    fma.rn.ftz.f32 %f103, %f18, %f32, %f102;
    setp.ge.f32 %p104, %f103, %f29;
    mov.b32 %f105, 0x3f000000;
    mov.b32 %f106, 0x3f800000;
    mov.b32 %f107, 0xbf800000;
    selp.f32 %f108, %f106, %f107, %p104;
    sub.ftz.f32 %f109, %f103, %f108;
    mul.ftz.f32 %f110, %f40, %f40;
    fma.rn.ftz.f32 %f111, %f43, %f43, %f110;
    fma.rn.ftz.f32 %f112, %f109, %f109, %f111;
    sqrt.approx.ftz.f32 %f113, %f112;
    mul.ftz.f32 %f114, %f105, %f113;
    abs.f32 %f115, %f114;
    setp.gt.f32 %p116, %f115, %f105;
    sub.ftz.f32 %f117, %f106, %f115;
    mul.ftz.f32 %f118, %f105, %f117;
    mul.ftz.f32 %f119, %f114, %f114;
    selp.f32 %f120, %f118, %f119, %p116;
    mul.ftz.f32 %f121, %f120, %f120;
    mul.ftz.f32 %f122, %f121, %f121;
    mov.b32 %f123, 0x3d2cb352;
    mov.b32 %f124, 0x3cc617e3;
    mov.b32 %f125, 0x3d3a3ec7;
    fma.rn.ftz.f32 %f126, %f120, %f124, %f125;
    mov.b32 %f127, 0x3d9980f6;
    mov.b32 %f128, 0x3e2aaae4;
    fma.rn.ftz.f32 %f129, %f120, %f127, %f128;
    fma.rn.ftz.f32 %f130, %f121, %f126, %f129;
    fma.rn.ftz.f32 %f131, %f122, %f123, %f130;
    sqrt.approx.ftz.f32 %f132, %f118;
    selp.f32 %f133, %f132, %f115, %p116;
    mul.ftz.f32 %f134, %f120, %f133;
    fma.rn.ftz.f32 %f135, %f131, %f134, %f133;
    add.ftz.f32 %f136, %f135, %f135;
    sub.ftz.f32 %f137, %f67, %f136;
    selp.f32 %f138, %f137, %f135, %p116;
    abs.f32 %f139, %f138;
    mov.b32 %f140, 0x80000000;
    and.b32 %f141, %f140, %f114;
    or.b32 %f142, %f139, %f141;
    mov.b32 %f143, 0x40000000;
    mul.ftz.f32 %f144, %f142, %f143;
    sub.ftz.f32 %f145, %f63, %f144;
    selp.f32 %f146, %f144, %f145, %p104;
    mov.b32 %f147, 0x3ea2f983;
    mul.ftz.f32 %f148, %f146, %f147;
    mul.ftz.f32 %f149, %f24, %f44;
    fma.rn.ftz.f32 %f150, %f23, %f40, %f149;
    fma.rn.ftz.f32 %f151, %f22, %f29, %f150;
    mul.ftz.f32 %f152, %f151, %f48;
    mul.ftz.f32 %f153, %f27, %f44;
    fma.rn.ftz.f32 %f154, %f26, %f40, %f153;
    fma.rn.ftz.f32 %f155, %f25, %f29, %f154;
    mul.ftz.f32 %f156, %f155, %f48;
    mul.ftz.f32 %f157, %f43, %f43;
    add.ftz.f32 %f158, %f110, %f157;
    sqrt.approx.ftz.f32 %f159, %f158;
    setp.eq.f32 %p160, %f159, %f29;
    neg.ftz.f32 %f161, %f159;
    selp.f32 %f162, %f29, %f161, %p160;
    rcp.approx.ftz.f32 %f163, %f159;
    mul.ftz.f32 %f164, %f43, %f163;
    mul.ftz.f32 %f165, %f103, %f164;
    selp.f32 %f166, %f29, %f165, %p160;
    mul.ftz.f32 %f167, %f40, %f163;
    mul.ftz.f32 %f168, %f103, %f167;
    selp.f32 %f169, %f106, %f168, %p160;
    mul.ftz.f32 %f170, %f10, %f169;
    fma.rn.ftz.f32 %f171, %f5, %f166, %f170;
    fma.rn.ftz.f32 %f172, %f4, %f162, %f171;
    mul.ftz.f32 %f173, %f172, %f63;
    mul.ftz.f32 %f174, %f24, %f169;
    fma.rn.ftz.f32 %f175, %f23, %f166, %f174;
    fma.rn.ftz.f32 %f176, %f22, %f162, %f175;
    mul.ftz.f32 %f177, %f176, %f63;
    mul.ftz.f32 %f178, %f27, %f169;
    fma.rn.ftz.f32 %f179, %f26, %f166, %f178;
    fma.rn.ftz.f32 %f180, %f25, %f162, %f179;
    mul.ftz.f32 %f181, %f180, %f63;
    st.param.f32 [result+0], %f28;
    st.param.f32 [result+4], %f49;
    st.param.f32 [result+8], %f37;
    st.param.f32 [result+12], %f35;
    st.param.f32 [result+16], %f32;
    st.param.f32 [result+20], %f57;
    st.param.f32 [result+24], %f58;
    st.param.f32 [result+28], %f59;
    st.param.u32 [result+32], %r60;
    st.param.f32 [result+36], %f100;
    st.param.f32 [result+40], %f148;
    st.param.f32 [result+44], %f152;
    st.param.f32 [result+48], %f57;
    st.param.f32 [result+52], %f58;
    st.param.f32 [result+56], %f59;
    st.param.f32 [result+60], %f156;
    st.param.f32 [result+64], %f173;
    st.param.f32 [result+68], %f177;
    st.param.f32 [result+72], %f181;
    ret;
}

.visible .func (.param .align 4 .b8 result[76]) __direct_callable__a41f2f95c1c81c35fbe079af8d8dc018(.reg .u32 self, .reg .u64 data, .param .align 4 .b8 params[28]) {
    // VCall: mitsuba::Shape::compute_surface_interaction()
    .reg.b8   %b <120>; .reg.b16 %w<120>; .reg.b32 %r<120>;
    .reg.b64  %rd<120>; .reg.f32 %f<120>; .reg.f64 %d<120>;
    .reg.pred %p <120>;

    ld.param.f32 %f4, [params+12];
    ld.global.u64 %rd5, [data+0];
    ld.global.u64 %rd6, [data+8];
    ld.param.u32 %r7, [params+24];
    mov.b32 %r8, 0x3;
    mul.lo.u32 %r9, %r7, %r8;
    mov.b32 %r10, 0x1;
    add.u32 %r11, %r9, %r10;
    mov.pred %p12, 0x1;
    mad.wide.u32 %rd3, %r11, 4, %rd6;
    ld.global.nc.u32 %r13, [%rd3];
    mul.lo.u32 %r14, %r13, %r8;
    mad.wide.u32 %rd3, %r14, 4, %rd5;
    ld.global.nc.f32 %f15, [%rd3];
    mad.wide.u32 %rd3, %r9, 4, %rd6;
    ld.global.nc.u32 %r16, [%rd3];
    mul.lo.u32 %r17, %r16, %r8;
    mad.wide.u32 %rd3, %r17, 4, %rd5;
    ld.global.nc.f32 %f18, [%rd3];
    sub.ftz.f32 %f19, %f15, %f18;
    mov.b32 %r20, 0x2;
    add.u32 %r21, %r9, %r20;
    mad.wide.u32 %rd3, %r21, 4, %rd6;
    ld.global.nc.u32 %r22, [%rd3];
    mul.lo.u32 %r23, %r22, %r8;
    add.u32 %r24, %r23, %r10;
    mad.wide.u32 %rd3, %r24, 4, %rd5;
    ld.global.nc.f32 %f25, [%rd3];
    add.u32 %r26, %r17, %r10;
    mad.wide.u32 %rd3, %r26, 4, %rd5;
    ld.global.nc.f32 %f27, [%rd3];
    sub.ftz.f32 %f28, %f25, %f27;
    add.u32 %r29, %r14, %r10;
    mad.wide.u32 %rd3, %r29, 4, %rd5;
    ld.global.nc.f32 %f30, [%rd3];
    sub.ftz.f32 %f31, %f30, %f27;
    mad.wide.u32 %rd3, %r23, 4, %rd5;
    ld.global.nc.f32 %f32, [%rd3];
    sub.ftz.f32 %f33, %f32, %f18;
    mul.ftz.f32 %f34, %f31, %f33;
    neg.ftz.f32 %f35, %f34;
    fma.rn.ftz.f32 %f36, %f19, %f28, %f35;
    add.u32 %r37, %r14, %r20;
    mad.wide.u32 %rd3, %r37, 4, %rd5;
    ld.global.nc.f32 %f38, [%rd3];
    add.u32 %r39, %r17, %r20;
    mad.wide.u32 %rd3, %r39, 4, %rd5;
    ld.global.nc.f32 %f40, [%rd3];
    sub.ftz.f32 %f41, %f38, %f40;
    add.u32 %r42, %r23, %r20;
    mad.wide.u32 %rd3, %r42, 4, %rd5;
    ld.global.nc.f32 %f43, [%rd3];
    sub.ftz.f32 %f44, %f43, %f40;
    mul.ftz.f32 %f45, %f19, %f44;
    neg.ftz.f32 %f46, %f45;
    fma.rn.ftz.f32 %f47, %f41, %f33, %f46;
    mul.ftz.f32 %f48, %f41, %f28;
    neg.ftz.f32 %f49, %f48;
    fma.rn.ftz.f32 %f50, %f31, %f44, %f49;
    mul.ftz.f32 %f51, %f50, %f50;
    fma.rn.ftz.f32 %f52, %f47, %f47, %f51;
    fma.rn.ftz.f32 %f53, %f36, %f36, %f52;
    rsqrt.approx.ftz.f32 %f54, %f53;
    mul.ftz.f32 %f55, %f36, %f54;
    mov.b32 %f56, 0x0;
    setp.ge.f32 %p57, %f55, %f56;
    mul.ftz.f32 %f58, %f50, %f54;
    mul.ftz.f32 %f59, %f58, %f58;
    mov.b32 %f60, 0x3f800000;
    mov.b32 %f61, 0xbf800000;
    selp.f32 %f62, %f60, %f61, %p57;
    add.ftz.f32 %f63, %f62, %f55;
    rcp.approx.ftz.f32 %f64, %f63;
    neg.ftz.f32 %f65, %f64;
    mul.ftz.f32 %f66, %f59, %f65;
    neg.ftz.f32 %f67, %f66;
    selp.f32 %f68, %f66, %f67, %p57;
    add.ftz.f32 %f69, %f68, %f60;
    ld.param.f32 %f70, [params+16];
    sub.ftz.f32 %f71, %f60, %f70;
    ld.param.f32 %f72, [params+20];
    sub.ftz.f32 %f73, %f71, %f72;
    mul.ftz.f32 %f74, %f32, %f72;
    fma.rn.ftz.f32 %f75, %f15, %f70, %f74;
    fma.rn.ftz.f32 %f76, %f18, %f73, %f75;
    mul.ftz.f32 %f77, %f25, %f72;
    fma.rn.ftz.f32 %f78, %f30, %f70, %f77;
    fma.rn.ftz.f32 %f79, %f27, %f73, %f78;
    mul.ftz.f32 %f80, %f43, %f72;
    fma.rn.ftz.f32 %f81, %f38, %f70, %f80;
    fma.rn.ftz.f32 %f82, %f40, %f73, %f81;
    mul.ftz.f32 %f83, %f47, %f54;
    mov.u32 %r84, self;
    mul.ftz.f32 %f85, %f58, %f83;
    mul.ftz.f32 %f86, %f85, %f65;
    neg.ftz.f32 %f87, %f86;
    selp.f32 %f88, %f86, %f87, %p57;
    ld.global.u64 %rd89, [data+16];
    mad.wide.u32 %rd3, %r23, 4, %rd89;
    ld.global.nc.f32 %f90, [%rd3];
    mad.wide.u32 %rd3, %r14, 4, %rd89;
    ld.global.nc.f32 %f91, [%rd3];
    mad.wide.u32 %rd3, %r17, 4, %rd89;
    ld.global.nc.f32 %f92, [%rd3];
    mul.ftz.f32 %f93, %f92, %f73;
    fma.rn.ftz.f32 %f94, %f91, %f70, %f93;
    fma.rn.ftz.f32 %f95, %f90, %f72, %f94;
    mad.wide.u32 %rd3, %r42, 4, %rd89;
    ld.global.nc.f32 %f96, [%rd3];
    mad.wide.u32 %rd3, %r37, 4, %rd89;
    ld.global.nc.f32 %f97, [%rd3];
    mad.wide.u32 %rd3, %r39, 4, %rd89;
    ld.global.nc.f32 %f98, [%rd3];
    mul.ftz.f32 %f99, %f98, %f73;
    fma.rn.ftz.f32 %f100, %f97, %f70, %f99;
    fma.rn.ftz.f32 %f101, %f96, %f72, %f100;
    mad.wide.u32 %rd3, %r24, 4, %rd89;
    ld.global.nc.f32 %f102, [%rd3];
    mad.wide.u32 %rd3, %r29, 4, %rd89;
    ld.global.nc.f32 %f103, [%rd3];
    mad.wide.u32 %rd3, %r26, 4, %rd89;
    ld.global.nc.f32 %f104, [%rd3];
    mul.ftz.f32 %f105, %f104, %f73;
    fma.rn.ftz.f32 %f106, %f103, %f70, %f105;
    fma.rn.ftz.f32 %f107, %f102, %f72, %f106;
    mul.ftz.f32 %f108, %f95, %f95;
    fma.rn.ftz.f32 %f109, %f107, %f107, %f108;
    fma.rn.ftz.f32 %f110, %f101, %f101, %f109;
    rsqrt.approx.ftz.f32 %f111, %f110;
    mul.ftz.f32 %f112, %f95, %f111;
    mul.ftz.f32 %f113, %f107, %f111;
    mul.ftz.f32 %f114, %f101, %f111;
    neg.ftz.f32 %f115, %f58;
    selp.f32 %f116, %f115, %f58, %p57;
    mul.ftz.f32 %f117, %f83, %f65;
    fma.rn.ftz.f32 %f118, %f83, %f117, %f62;
    neg.ftz.f32 %f119, %f83;
    st.param.f32 [result+0], %f4;
    st.param.f32 [result+4], %f69;
    st.param.f32 [result+8], %f76;
    st.param.f32 [result+12], %f79;
    st.param.f32 [result+16], %f82;
    st.param.f32 [result+20], %f58;
    st.param.f32 [result+24], %f83;
    st.param.f32 [result+28], %f55;
    st.param.u32 [result+32], %r84;
    st.param.f32 [result+36], %f70;
    st.param.f32 [result+40], %f72;
    st.param.f32 [result+44], %f88;
    st.param.f32 [result+48], %f112;
    st.param.f32 [result+52], %f113;
    st.param.f32 [result+56], %f114;
    st.param.f32 [result+60], %f116;
    st.param.f32 [result+64], %f86;
    st.param.f32 [result+68], %f118;
    st.param.f32 [result+72], %f119;
    ret;
}


COMPILE ERROR: Invalid PTX input: ptx2llvm-module-001: error: Failed to parse input PTX string
ptx2llvm-module-001, line 2; fatal   : Unsupported .target 'sm_87'
Cannot parse input PTX string

Steps to reproduce

This seems to be related to the newer NVIDIA drivers/CUDA versions, as it used to work before, but here are the minimal steps to reproduce the issue:

  1. A clean installation of the latest Mitsuba following the build instructions (Release build)
  2. Adding the generated python folder to the Python path
  3. Running any python script (including the test scripts on the website) using a CUDA variant
@njroussel
Copy link
Member

Hi @memamsaleh

This is a known issue. It has already been fixed in drjit.

There are two solutions:

  1. If you're using the PyPI package (pip install mitsuba) you will need to downgrade your GPU driver (anything prior to v535 seems to be fine).
  2. If you're building the project from source you will need the latest drjit commits. The current master branch of Mitsuba contains the necessary drjit fixes.

@memamsaleh
Copy link
Author

@njroussel Thanks for linking the drjit issue. For some reason, the latest master branch of Mitsuba did not contain the fix, so I had to add it manually.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants