
NonZero prevents values from being const-propagated properly #51346

Closed
Amanieu opened this issue Jun 4, 2018 · 17 comments
Labels
- A-LLVM (Area): Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
- C-bug (Category): This is a bug.
- I-slow (Issue): Problems and improvements with respect to performance of generated code.
- T-compiler: Relevant to the compiler team, which will review and decide on the PR/issue.

Comments

@Amanieu (Member) commented Jun 4, 2018

Found while tracking down the root cause of the slowdown in #51340. This is the minimized code which reproduces the performance issue (Playground link).

#![crate_type = "rlib"]

use std::num::NonZeroUsize;

unsafe fn repeat(c: bool) -> Option<NonZeroUsize> {
    if c {
        None
    } else {
        Some(NonZeroUsize::new_unchecked(8))
    }
}

pub unsafe fn calculate_layout(c: bool) -> usize {
    match repeat(c) {
        Some(x) => 8 - x.get(),
        None => std::hint::unreachable_unchecked(),
    }
}

This code produces the following output:

// ASM
playground::calculate_layout:
	mov	eax, edi
	shl	rax, 3
	ret

// LLVM
define i64 @_ZN10playground16calculate_layout17h33252013a8d7ec56E(i1 zeroext %c) unnamed_addr #0 {
start:
  %0 = select i1 %c, i64 8, i64 0
  ret i64 %0
}

Note that in practice, it is impossible for this function to return anything other than 0 (8 - 8) due to the call to unreachable_unchecked.

The version using usize does not suffer from this (Playground link):

#![crate_type = "rlib"]

unsafe fn repeat(c: bool) -> Option<usize> {
    if c {
        None
    } else {
        Some(8)
    }
}

pub unsafe fn calculate_layout(c: bool) -> usize {
    match repeat(c) {
        Some(x) => 8 - x,
        None => std::hint::unreachable_unchecked(),
    }
}

This version produces the following output:

// ASM
playground::calculate_layout:
	xor	eax, eax
	ret

// LLVM
define i64 @_ZN10playground16calculate_layout17h33252013a8d7ec56E(i1 zeroext %c) unnamed_addr #0 {
start:
  ret i64 0
}

cc @gnzlbg @rkruppe

@hanna-kruppe (Contributor) commented Jun 4, 2018

Looking through -print-after-all output, the following jumps out:

*** IR Dump After Value Propagation ***
; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #0 {
start:
  %spec.select.i = select i1 %c, i64 0, i64 8
  br i1 %c, label %bb2, label %bb4

bb2:                                              ; preds = %start
  unreachable

bb4:                                              ; preds = %start
  %0 = sub i64 8, %spec.select.i
  %1 = insertvalue { i64, i1 } undef, i64 %0, 0
  %2 = insertvalue { i64, i1 } %1, i1 false, 1
  %3 = extractvalue { i64, i1 } %2, 1
  br i1 %3, label %panic, label %bb6, !prof !0

bb6:                                              ; preds = %bb4
  %4 = extractvalue { i64, i1 } %2, 0
  ret i64 %4

panic:                                            ; preds = %bb4
  call ... ; elided
  unreachable
}
*** IR Dump After Simplify the CFG ***
; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #0 {
start:
  %spec.select.i = select i1 %c, i64 0, i64 8
  %0 = sub i64 8, %spec.select.i
  %1 = insertvalue { i64, i1 } undef, i64 %0, 0
  %2 = insertvalue { i64, i1 } %1, i1 false, 1
  %3 = extractvalue { i64, i1 } %2, 1
  br i1 %3, label %panic, label %bb6, !prof !0

bb6:                                              ; preds = %start
  %4 = extractvalue { i64, i1 } %2, 0
  ret i64 %4

panic:                                            ; preds = %start
  call ... ; elided
  unreachable
}

SimplifyCFG removes br i1 %c, <bb with unreachable>, <bb with calculations> in favor of falling through to the block with the calculations. This loses the information that %c being true is UB, which could have been used to simplify the select at the start. Indeed, manually adding an assume(not %c) makes it optimize down to ret i64 0 (this appears to be instcombine's doing, leaning on the AssumptionCache). Of course, we do want to get rid of that branch, and we don't really want to carry an assume around forever; we'd just like some more optimization to happen to the select before the UB is exploited in another way.
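
For concreteness, a minimal sketch of that experiment (the exact IR and the placement of the assume are assumed here, not copied from the actual test); with the assume in place, instcombine can prove the select is 8 and the subtraction is 0:

; Sketch only: post-SimplifyCFG IR with a manual assume(not %c) added.
; Knowing %c is false, instcombine folds %spec.select to 8, the sub to 0,
; and the whole body to `ret i64 0`.
define i64 @calculate_layout_with_assume(i1 zeroext %c) unnamed_addr {
start:
  %not.c = xor i1 %c, true
  call void @llvm.assume(i1 %not.c)
  %spec.select = select i1 %c, i64 0, i64 8
  %0 = sub i64 8, %spec.select
  ret i64 %0
}

declare void @llvm.assume(i1)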

@hanna-kruppe (Contributor) commented:

In the usize case, the hard work (noticing that the subtraction always results in 0) is done by interprocedural constant propagation. Since Option<usize> = { i64, i64 }, repeat unconditionally (after optimizations) puts 8 into the return value's payload, which can then be propagated before any inlining and regardless of the complexities related to the discriminant. So it only accidentally side-steps the problem of "if %c then UB" not being properly exploited, by using a different representation.

Before/after IR with module-level cruft removed:

*** IR Dump After Infer set function attributes ***
; Function Attrs: inlinehint noreturn uwtable
define internal void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E() unnamed_addr #0 {
start:
  unreachable
}

; Function Attrs: uwtable
define internal { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c) unnamed_addr #1 {
start:
  br i1 %c, label %bb1, label %bb2

bb1:                                              ; preds = %start
  br label %bb3

bb2:                                              ; preds = %start
  br label %bb3

bb3:                                              ; preds = %bb2, %bb1
  %_0.sroa.0.0 = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
  %0 = insertvalue { i64, i64 } undef, i64 %_0.sroa.0.0, 0
  %1 = insertvalue { i64, i64 } %0, i64 8, 1
  ret { i64, i64 } %1
}

; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #1 {
start:
  %0 = call { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c)
  %.fca.0.extract = extractvalue { i64, i64 } %0, 0
  %.fca.1.extract = extractvalue { i64, i64 } %0, 1
  %switch = icmp ult i64 %.fca.0.extract, 1
  br i1 %switch, label %bb2, label %bb4

bb2:                                              ; preds = %start
  call void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E()
  unreachable

bb4:                                              ; preds = %start
  %1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 8, i64 %.fca.1.extract)
  %2 = extractvalue { i64, i1 } %1, 0
  %3 = extractvalue { i64, i1 } %1, 1
  br i1 %3, label %panic, label %bb5, !prof !0

bb5:                                              ; preds = %bb4
  ret i64 %2

panic:                                            ; preds = %bb4
  call void ... ; elided
  unreachable
}

*** IR Dump After Interprocedural Sparse Conditional Constant Propagation ***

; Function Attrs: inlinehint noreturn uwtable
define internal void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E() unnamed_addr #0 {
start:
  unreachable
}

; Function Attrs: uwtable
define internal { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c) unnamed_addr #1 {
start:
  br i1 %c, label %bb1, label %bb2

bb1:                                              ; preds = %start
  br label %bb3

bb2:                                              ; preds = %start
  br label %bb3

bb3:                                              ; preds = %bb2, %bb1
  %_0.sroa.0.0 = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
  %0 = insertvalue { i64, i64 } undef, i64 %_0.sroa.0.0, 0
  %1 = insertvalue { i64, i64 } %0, i64 8, 1
  ret { i64, i64 } %1
}

; Function Attrs: uwtable
define i64 @_ZN13nonzero_usize16calculate_layout17h044f84d640378d0bE(i1 zeroext %c) unnamed_addr #1 {
start:
  %0 = call { i64, i64 } @_ZN13nonzero_usize6repeat17hff27c5667304426aE(i1 zeroext %c)
  %.fca.0.extract = extractvalue { i64, i64 } %0, 0
  %switch = icmp ult i64 %.fca.0.extract, 1
  br i1 %switch, label %bb2, label %bb4

bb2:                                              ; preds = %start
  call void @_ZN4core4hint21unreachable_unchecked17h5c82d720186d4847E()
  unreachable

bb4:                                              ; preds = %start
  %1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 8, i64 8)
  %2 = extractvalue { i64, i1 } %1, 0
  %3 = extractvalue { i64, i1 } %1, 1
  br i1 %3, label %panic, label %bb5, !prof !0

bb5:                                              ; preds = %bb4
  ret i64 %2

panic:                                            ; preds = %bb4
  call void ... ; elided
  unreachable
}

@Amanieu (Member Author) commented Jun 4, 2018

@rkruppe Look at this version, which is closer to the original code and produces the following LLVM IR for playground::new:

define i64 @_ZN10playground3new17hc1cbbc83e9256887E(i64 %c) unnamed_addr #2 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8) #8
  %1 = extractvalue { i64, i1 } %0, 1
  %2 = extractvalue { i64, i1 } %0, 0
  %spec.select.i.i = select i1 %1, i64 0, i64 8
  br i1 %1, label %bb4.i1, label %bb4.i

bb4.i:                                            ; preds = %start
  %3 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 16) #8
  %4 = extractvalue { i64, i1 } %3, 1
  %5 = extractvalue { i64, i1 } %3, 0
  %spec.select.i16.i = select i1 %4, i64 0, i64 8
  br i1 %4, label %bb4.i1, label %bb11.i

bb11.i:                                           ; preds = %bb4.i
  %6 = icmp ult i64 %spec.select.i16.i, %spec.select.i.i
  %_0.0.sroa.speculated.i.i.i.i = select i1 %6, i64 %spec.select.i.i, i64 %spec.select.i16.i
  %7 = add i64 %2, -1
  %8 = add i64 %7, %spec.select.i16.i
  %9 = sub nsw i64 0, %spec.select.i16.i
  %10 = and i64 %8, %9
  %11 = sub i64 %10, %2
  %12 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %2, i64 %11) #8
  %13 = extractvalue { i64, i1 } %12, 0
  %14 = extractvalue { i64, i1 } %12, 1
  br i1 %14, label %bb4.i1, label %bb11.i.i

bb11.i.i:                                         ; preds = %bb11.i
  %15 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %13, i64 %5) #8
  %16 = extractvalue { i64, i1 } %15, 1
  br i1 %16, label %bb4.i1, label %bb21.i.i

bb21.i.i:                                         ; preds = %bb11.i.i
  %17 = extractvalue { i64, i1 } %15, 0
  %18 = add nuw nsw i64 %_0.0.sroa.speculated.i.i.i.i, 15
  %19 = and i64 %18, %_0.0.sroa.speculated.i.i.i.i
  %20 = icmp ne i64 %19, 0
  %21 = icmp eq i64 %_0.0.sroa.speculated.i.i.i.i, 0
  %or.cond.i.not.i.i.i = or i1 %21, %20
  %22 = sub nsw i64 0, %_0.0.sroa.speculated.i.i.i.i
  %23 = icmp ugt i64 %17, %22
  %24 = or i1 %or.cond.i.not.i.i.i, %23
  br i1 %24, label %bb4.i1, label %"_ZN47_$LT$core..result..Result$LT$T$C$$u20$E$GT$$GT$6unwrap17h2368a06f0a7bf993E.exit"

bb4.i1:                                           ; preds = %bb21.i.i, %bb4.i, %bb11.i.i, %bb11.i, %start
; call core::result::unwrap_failed
  tail call fastcc void @_ZN4core6result13unwrap_failed17hde364ddca953ee8aE(), !noalias !6
  unreachable

"_ZN47_$LT$core..result..Result$LT$T$C$$u20$E$GT$$GT$6unwrap17h2368a06f0a7bf993E.exit": ; preds = %bb21.i.i
  ret i64 %13
}

These lines in particular highlight the problem:

  %spec.select.i.i = select i1 %1, i64 0, i64 8
  br i1 %1, label %bb4.i1, label %bb4.i

[...]

  %spec.select.i16.i = select i1 %4, i64 0, i64 8
  br i1 %4, label %bb4.i1, label %bb11.i

In %bb4.i, %spec.select.i.i can only have the value 8. The same applies to %bb11.i and %spec.select.i16.i. The missed constant propagation here hurts code generation for the rest of the function.

@anp (Member) commented Jun 4, 2018

@rust-lang/wg-codegen

@hanna-kruppe (Contributor) commented:

@Amanieu Interesting, that one can't be handled by exploiting UB (but we should still track that other problem; it'll affect users of unchecked_unwrap-style functions). It's too big to properly analyze why it isn't optimized well, but based on what you've identified as the core problem I came up with this IR test case:

define i64 @foo(i64 %c) {
bb1:
  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8)
  %overflow = extractvalue { i64, i1 } %mul, 1
  %select = select i1 %overflow, i64 0, i64 8
  br i1 %overflow, label %abort, label %bb2

bb2:
  call void @dummy(i64 %select)
  ret i64 %select

abort:
  call void @abort()
  unreachable
}
declare void @abort()
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare void @dummy(i64)

opt -O2 does nothing to this, and neither does NewGVN for that matter.

Removing the call to @dummy makes instcombine optimize the return statement to ret i64 8. I assume this is because of some "has only one use" check in instcombine. I haven't bothered to search for a simple expression that uses %select twice which instcombine can't turn into a single use.

Curiously, it also gets optimized if the select is sunk into bb2. I assume that's because of this cheapskate test, which makes me wonder why the definition isn't sunk into bb2 by any pass in the -O2 pipeline.
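
For reference, a sketch of the sunk variant described above (the exact IR that was tested is assumed, not quoted); with the select defined inside bb2, instcombine sees that the block is only reached when %overflow is false and folds the select to 8:

; Sketch only: the same test case, but with the select moved into bb2.
define i64 @foo_sunk(i64 %c) {
bb1:
  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8)
  %overflow = extractvalue { i64, i1 } %mul, 1
  br i1 %overflow, label %abort, label %bb2

bb2:
  ; %overflow is known false here, so this select simplifies to 8
  %select = select i1 %overflow, i64 0, i64 8
  call void @dummy(i64 %select)
  ret i64 %select

abort:
  call void @abort()
  unreachable
}
declare void @abort()
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare void @dummy(i64)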

@hanna-kruppe (Contributor) commented:

Turns out there is a code sinking pass, and opt -sink -instcombine optimizes the above test case as well as anyone could hope. It also does wonders on the IR from @Amanieu's last "more like the original" code. Now to find out why it isn't enabled in the default pipeline...
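
The expected end state on the IR test case above is roughly the following (a sketch of the folded form, not verbatim pass output): -sink moves the select into bb2, and instcombine then replaces it with the constant 8.

; Sketch only: the test case after `opt -sink -instcombine`.
define i64 @foo(i64 %c) {
bb1:
  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8)
  %overflow = extractvalue { i64, i1 } %mul, 1
  br i1 %overflow, label %abort, label %bb2

bb2:
  ; the sunk select folded to 8 because %overflow is known false here
  call void @dummy(i64 8)
  ret i64 8

abort:
  call void @abort()
  unreachable
}
declare void @abort()
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare void @dummy(i64)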

@hanna-kruppe (Contributor) commented:

Talked about this with @sunfishcode on IRC. Seems like there's no real reason for -sink to not be in the optimization pipeline(s) by default. However, apparently MachineSink was believed to cover many such cases (clearly not true here) and sinking could sometimes be a pessimization because it can lengthen the live ranges of the operands of the instruction that's moved. I'm unsure what to do about this issue. Ideally this optimization could be applied without moving code, but writing that pass & getting it upstream & enabled by default seems like a lot of work.

@gnzlbg (Contributor) commented Jun 4, 2018

At least there should be an LLVM bug tracking this as well.

@Amanieu (Member Author) commented Jun 4, 2018

I'm somewhat worried about the amount of code that could be affected by this considering how often Option, Result and ? are used in Rust.

@Amanieu (Member Author) commented Jul 6, 2018

@rkruppe Could we try enabling the sinking pass in rustc so that we can see what the performance impact looks like (positive or negative)? This seems like something that would benefit Rust much more than C/C++.

@hanna-kruppe (Contributor) commented:

AFAIK we still don't have infrastructure for properly measuring the performance impact of a change on the run time of anything other than rustc itself. Without that, I'm not confident in our ability to check for regressions and quantify them.

> This seems like something that would benefit Rust much more than C/C++

Possibly, but there are more cases like that (e.g. range checks), and we've generally been hesitant to deviate too far from the established pass pipelines in those cases too.

@nagisa (Member) commented Jul 11, 2018 via email

@estebank estebank added the A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues. label Jan 8, 2019
@jonas-schievink jonas-schievink added I-slow Issue: Problems and improvements with respect to performance of generated code. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. labels Apr 17, 2019
@JohnTitor JohnTitor added the C-bug Category: This is a bug. label Nov 10, 2019
@LingMan (Contributor) commented Sep 21, 2021

Three years later

The example given in the first post optimizes as expected since Rust 1.52 (Godbolt); however, the example from this comment still doesn't optimize away the %select (Godbolt).
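
For reference, the first post's example now folds all the way down, matching what the usize version already produced in the original report (a sketch; the real symbol name and attributes are simplified here):

; Sketch only: expected fully optimized IR for calculate_layout from the
; first post, identical in substance to the usize version's output above.
define i64 @calculate_layout(i1 zeroext %c) unnamed_addr {
start:
  ret i64 0
}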

wweiandrew added a commit to llvm/llvm-project that referenced this issue Mar 18, 2022
…ock.

This patch tries to sink instructions when they are only used in a successor block.

This is a further enhancement patch based on Anna's commit:
D109700, which allows sinking an instruction having multiple uses in a single user.

In this patch, sink instructions with multiple users in a single successor block will be supported.
It could fix a known issue from rust:
  rust-lang/rust#51346 (comment)

Reviewed By: nikic, reames

Differential Revision: https://reviews.llvm.org/D121585
@wweiandrew commented:

> Three years later
>
> The example given in the first post optimizes as expected since Rust 1.52 (Godbolt); however, the example from this comment still doesn't optimize away the %select (Godbolt).

Fix in llvm/llvm-project@0af3e6a

@wweiandrew commented:

@Amanieu could you help check whether this issue can be closed?

@Amanieu (Member Author) commented Apr 5, 2022

I can confirm that this fixes the problem.

@nikic Do you think this is worth backporting or should we just wait for LLVM 15?

mem-frob pushed a commit to draperlaboratory/hope-llvm-project that referenced this issue Oct 7, 2022
@Amanieu (Member Author) commented Oct 20, 2022

We've upgraded to LLVM 15, which includes the fix.

Amanieu closed this as completed Oct 20, 2022