From c7ba5a9affbcabb0d05301e5417c203274667572 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Mar 2022 13:31:05 +0000 Subject: [PATCH] [X86][SSE] Add initial support for extracting non-constant bool vector elements We can use MOVMSK+TEST/BT to extract individual bool elements even if the index isn't constant. This relies on combineBitcastvxi1 so some AVX512 cases still aren't optimized as they avoid MOVMSK usage. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +-- .../test/CodeGen/X86/avx512-insert-extract.ll | 20 +--- llvm/test/CodeGen/X86/movmsk-cmp.ll | 113 ++++++++---------- 3 files changed, 62 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f3f78a624d69d..0ba6fa26c7437 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43333,29 +43333,32 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, // but not // i1 = extract_vector_elt t0:1, Constant:i64<2> // since the latter would need its own MOVMSK. - if (CIdx && SrcVT.getScalarType() == MVT::i1) { + if (SrcVT.getScalarType() == MVT::i1) { + bool IsVar = !CIdx; SmallVector BoolExtracts; unsigned ResNo = InputVector.getResNo(); - auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) { + auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa(Use->getOperand(1)) && Use->getOperand(0).getResNo() == ResNo && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); + IsVar |= !isa(Use->getOperand(1)); return true; } return false; }; + // TODO: Can we drop the oneuse check for constant extracts? 
if (all_of(InputVector->uses(), IsBoolExtract) && - BoolExtracts.size() > 1) { + (IsVar || BoolExtracts.size() > 1)) { EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { for (SDNode *Use : BoolExtracts) { // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask - unsigned MaskIdx = Use->getConstantOperandVal(1); - APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx); - SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT); + // Mask = 1 << MaskIdx + SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8); + SDValue MaskBit = DAG.getConstant(1, dl, BCVT); + SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx); SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); DCI.CombineTo(Use, Res); diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 78bb6d5f1a635..5e0318b1984e4 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -1633,23 +1633,13 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> % define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v32i1: ; KNL: ## %bb.0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-32, %rsp -; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: ## kill: def $edi killed $edi def $rdi ; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1 ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa %ymm0, (%rsp) -; KNL-NEXT: andl $31, %edi -; KNL-NEXT: movzbl (%rsp,%rdi), %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp +; KNL-NEXT: 
vpmovmskb %ymm0, %ecx +; KNL-NEXT: notl %ecx +; KNL-NEXT: xorl %eax, %eax +; KNL-NEXT: btl %edi, %ecx +; KNL-NEXT: setb %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 955266a782c40..70a086e96e6e4 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -4264,33 +4264,26 @@ define i1 @movmsk_or_v2f64(<2 x double> %x, <2 x double> %y) { define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) { ; SSE-LABEL: movmsk_v16i8_var: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $15, %edi -; SSE-NEXT: movb -24(%rsp,%rdi), %al +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: btl %edi, %eax +; SSE-NEXT: setb %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v16i8_var: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1OR2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; AVX1OR2-NEXT: andl $15, %edi -; AVX1OR2-NEXT: movb -24(%rsp,%rdi), %al +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: btl %edi, %eax +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v16i8_var: ; KNL: # %bb.0: -; KNL-NEXT: # kill: def $edi killed $edi def $rdi ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm0, -{{[0-9]+}}(%rsp) -; KNL-NEXT: andl $15, %edi -; KNL-NEXT: movb -24(%rsp,%rdi), %al -; KNL-NEXT: vzeroupper +; KNL-NEXT: vpmovmskb %xmm0, %eax +; KNL-NEXT: btl %edi, %eax +; KNL-NEXT: setb %al ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v16i8_var: @@ -4310,20 +4303,20 @@ define i1 @movmsk_v16i8_var(<16 x i8> %x, <16 x i8> %y, i32 %z) { define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) { ; SSE-LABEL: 
movmsk_v8i16_var: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $7, %edi -; SSE-NEXT: movb -24(%rsp,%rdi,2), %al +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: btl %edi, %eax +; SSE-NEXT: setb %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v8i16_var: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1OR2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; AVX1OR2-NEXT: andl $7, %edi -; AVX1OR2-NEXT: movb -24(%rsp,%rdi,2), %al +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: btl %edi, %eax +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v8i16_var: @@ -4357,20 +4350,18 @@ define i1 @movmsk_v8i16_var(<8 x i16> %x, <8 x i16> %y, i32 %z) { define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) { ; SSE-LABEL: movmsk_v4i32_var: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $3, %edi -; SSE-NEXT: movb -24(%rsp,%rdi,4), %al +; SSE-NEXT: movmskps %xmm1, %eax +; SSE-NEXT: btl %edi, %eax +; SSE-NEXT: setb %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v4i32_var: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1OR2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; AVX1OR2-NEXT: andl $3, %edi -; AVX1OR2-NEXT: movb -24(%rsp,%rdi,4), %al +; AVX1OR2-NEXT: vmovmskps %xmm0, %eax +; AVX1OR2-NEXT: btl %edi, %eax +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v4i32_var: @@ -4403,37 +4394,31 @@ define i1 @movmsk_v4i32_var(<4 x i32> %x, <4 x i32> %y, i32 %z) { define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { ; SSE2-LABEL: movmsk_v2i64_var: ; SSE2: # %bb.0: -; SSE2-NEXT: # 
kill: def $edi killed $edi def $rdi ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: andl $1, %edi -; SSE2-NEXT: movb -24(%rsp,%rdi,8), %al +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: btl %edi, %eax +; SSE2-NEXT: setb %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: movmsk_v2i64_var: ; SSE41: # %bb.0: -; SSE41-NEXT: # kill: def $edi killed $edi def $rdi ; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: andl $1, %edi -; SSE41-NEXT: movb -24(%rsp,%rdi,8), %al +; SSE41-NEXT: movmskpd %xmm0, %eax +; SSE41-NEXT: xorl $3, %eax +; SSE41-NEXT: btl %edi, %eax +; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v2i64_var: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; AVX1OR2-NEXT: andl $1, %edi -; AVX1OR2-NEXT: movb -24(%rsp,%rdi,8), %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax +; AVX1OR2-NEXT: xorl $3, %eax +; AVX1OR2-NEXT: btl %edi, %eax +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v2i64_var: @@ -4466,23 +4451,21 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) { ; SSE-LABEL: movmsk_v4f32_var: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: cmpeqps %xmm1, %xmm2 ; SSE-NEXT: cmpunordps %xmm1, %xmm0 ; SSE-NEXT: orps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $3, %edi -; SSE-NEXT: movb -24(%rsp,%rdi,4), %al +; SSE-NEXT: 
movmskps %xmm0, %eax +; SSE-NEXT: btl %edi, %eax +; SSE-NEXT: setb %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v4f32_var: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1OR2-NEXT: vcmpeq_uqps %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1OR2-NEXT: andl $3, %edi -; AVX1OR2-NEXT: movb -24(%rsp,%rdi,4), %al +; AVX1OR2-NEXT: vmovmskps %xmm0, %eax +; AVX1OR2-NEXT: btl %edi, %eax +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v4f32_var: @@ -4515,20 +4498,18 @@ define i1 @movmsk_v4f32_var(<4 x float> %x, <4 x float> %y, i32 %z) { define i1 @movmsk_v2f64_var(<2 x double> %x, <2 x double> %y, i32 %z) { ; SSE-LABEL: movmsk_v2f64_var: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: cmplepd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: andl $1, %edi -; SSE-NEXT: movb -24(%rsp,%rdi,8), %al +; SSE-NEXT: movmskpd %xmm1, %eax +; SSE-NEXT: btl %edi, %eax +; SSE-NEXT: setb %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v2f64_var: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vmovapd %xmm0, -{{[0-9]+}}(%rsp) -; AVX1OR2-NEXT: andl $1, %edi -; AVX1OR2-NEXT: movb -24(%rsp,%rdi,8), %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax +; AVX1OR2-NEXT: btl %edi, %eax +; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v2f64_var: