Skip to content

Commit

Permalink
[X86][SSE] Add initial support for extracting non-constant bool vecto…
Browse files Browse the repository at this point in the history
…r elements

We can use MOVMSK+TEST/BT to extract individual bool elements even if the index isn't constant.

This relies on combineBitcastvxi1, so some AVX512 cases still aren't optimized because they avoid MOVMSK usage.
  • Loading branch information
RKSimon committed Mar 19, 2022
1 parent abb9cbb commit c7ba5a9
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 88 deletions.
17 changes: 10 additions & 7 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43333,29 +43333,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// but not
// i1 = extract_vector_elt t0:1, Constant:i64<2>
// since the latter would need its own MOVMSK.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
if (SrcVT.getScalarType() == MVT::i1) {
bool IsVar = !CIdx;
SmallVector<SDNode *, 16> BoolExtracts;
unsigned ResNo = InputVector.getResNo();
auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
return true;
}
return false;
};
// TODO: Can we drop the oneuse check for constant extracts?
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
(IsVar || BoolExtracts.size() > 1)) {
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
for (SDNode *Use : BoolExtracts) {
// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
unsigned MaskIdx = Use->getConstantOperandVal(1);
APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
// Mask = 1 << MaskIdx
SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
DCI.CombineTo(Use, Res);
Expand Down
20 changes: 5 additions & 15 deletions llvm/test/CodeGen/X86/avx512-insert-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1633,23 +1633,13 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v32i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzbl (%rsp,%rdi), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vpmovmskb %ymm0, %ecx
; KNL-NEXT: notl %ecx
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: btl %edi, %ecx
; KNL-NEXT: setb %al
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
Expand Down
113 changes: 47 additions & 66 deletions llvm/test/CodeGen/X86/movmsk-cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4264,33 +4264,26 @@ define i1 @movmsk_or_v2f64(<2 x double> %x, <2 x double> %y) {
define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) {
; SSE-LABEL: movmsk_v16i8_var:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movb -24(%rsp,%rdi), %al
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: btl %edi, %eax
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v16i8_var:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: andl $15, %edi
; AVX1OR2-NEXT: movb -24(%rsp,%rdi), %al
; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT: btl %edi, %eax
; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v16i8_var:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $edi killed $edi def $rdi
; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT: andl $15, %edi
; KNL-NEXT: movb -24(%rsp,%rdi), %al
; KNL-NEXT: vzeroupper
; KNL-NEXT: vpmovmskb %xmm0, %eax
; KNL-NEXT: btl %edi, %eax
; KNL-NEXT: setb %al
; KNL-NEXT: retq
;
; SKX-LABEL: movmsk_v16i8_var:
Expand All @@ -4310,20 +4303,20 @@ define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) {
define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
; SSE-LABEL: movmsk_v8i16_var:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movb -24(%rsp,%rdi,2), %al
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: btl %edi, %eax
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v8i16_var:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: andl $7, %edi
; AVX1OR2-NEXT: movb -24(%rsp,%rdi,2), %al
; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
; AVX1OR2-NEXT: btl %edi, %eax
; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v8i16_var:
Expand Down Expand Up @@ -4357,20 +4350,18 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) {
define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
; SSE-LABEL: movmsk_v4i32_var:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movb -24(%rsp,%rdi,4), %al
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: btl %edi, %eax
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v4i32_var:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: andl $3, %edi
; AVX1OR2-NEXT: movb -24(%rsp,%rdi,4), %al
; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
; AVX1OR2-NEXT: btl %edi, %eax
; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v4i32_var:
Expand Down Expand Up @@ -4403,37 +4394,31 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) {
define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
; SSE2-LABEL: movmsk_v2i64_var:
; SSE2: # %bb.0:
; SSE2-NEXT: # kill: def $edi killed $edi def $rdi
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $1, %edi
; SSE2-NEXT: movb -24(%rsp,%rdi,8), %al
; SSE2-NEXT: movmskpd %xmm1, %eax
; SSE2-NEXT: xorl $3, %eax
; SSE2-NEXT: btl %edi, %eax
; SSE2-NEXT: setb %al
; SSE2-NEXT: retq
;
; SSE41-LABEL: movmsk_v2i64_var:
; SSE41: # %bb.0:
; SSE41-NEXT: # kill: def $edi killed $edi def $rdi
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: andl $1, %edi
; SSE41-NEXT: movb -24(%rsp,%rdi,8), %al
; SSE41-NEXT: movmskpd %xmm0, %eax
; SSE41-NEXT: xorl $3, %eax
; SSE41-NEXT: btl %edi, %eax
; SSE41-NEXT: setb %al
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v2i64_var:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: andl $1, %edi
; AVX1OR2-NEXT: movb -24(%rsp,%rdi,8), %al
; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax
; AVX1OR2-NEXT: xorl $3, %eax
; AVX1OR2-NEXT: btl %edi, %eax
; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v2i64_var:
Expand Down Expand Up @@ -4466,23 +4451,21 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
; SSE-LABEL: movmsk_v4f32_var:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: cmpeqps %xmm1, %xmm2
; SSE-NEXT: cmpunordps %xmm1, %xmm0
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movb -24(%rsp,%rdi,4), %al
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: btl %edi, %eax
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v4f32_var:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vcmpeq_uqps %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: andl $3, %edi
; AVX1OR2-NEXT: movb -24(%rsp,%rdi,4), %al
; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
; AVX1OR2-NEXT: btl %edi, %eax
; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v4f32_var:
Expand Down Expand Up @@ -4515,20 +4498,18 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) {
define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) {
; SSE-LABEL: movmsk_v2f64_var:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: cmplepd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movb -24(%rsp,%rdi,8), %al
; SSE-NEXT: movmskpd %xmm1, %eax
; SSE-NEXT: btl %edi, %eax
; SSE-NEXT: setb %al
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: movmsk_v2f64_var:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi
; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: vmovapd %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT: andl $1, %edi
; AVX1OR2-NEXT: movb -24(%rsp,%rdi,8), %al
; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax
; AVX1OR2-NEXT: btl %edi, %eax
; AVX1OR2-NEXT: setb %al
; AVX1OR2-NEXT: retq
;
; KNL-LABEL: movmsk_v2f64_var:
Expand Down

0 comments on commit c7ba5a9

Please sign in to comment.