diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 83308682835394..5cd3a3f1c32e66 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -21906,6 +21906,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { IRBuilder<> Builder(SI); + auto Mask = SVI->getShuffleMask(); auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType()); // Given SVI : <n*factor x ty>, then VTy : <n x ty> auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), @@ -21917,11 +21918,35 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + unsigned Index; + // If the segment store only has one active lane (i.e. the interleave is + // just a spread shuffle), we can use a strided store instead. This will + // be equally fast, and create less vector register pressure. + if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) && + isSpreadMask(Mask, Factor, Index)) { + unsigned ScalarSizeInBytes = ShuffleVTy->getScalarSizeInBits() / 8; + Value *Data = SVI->getOperand(0); + auto *DataVTy = cast<FixedVectorType>(Data->getType()); + Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); + Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); + Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); + Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); + Value *VL = Builder.getInt32(VTy->getNumElements()); + + CallInst *CI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_store, + {Data->getType(), BasePtr->getType(), Stride->getType()}, + {Data, BasePtr, Stride, Mask, VL}); + CI->addParamAttr( + 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); + + return true; + } + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, SI->getPointerOperandType(), 
XLenTy}); - auto Mask = SVI->getShuffleMask(); SmallVector<Value *, 10> Ops; for (unsigned i = 0; i < Factor; i++) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 5649ee20a47092..8833634be1a0ed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -1285,17 +1285,55 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) { ret void } -; TODO: This should be a strided store -define void @store_factor4_one_active_storeback(ptr %ptr, <4 x i32> %v) { -; CHECK-LABEL: store_factor4_one_active_storeback: +define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) { +; CHECK-LABEL: store_factor4_one_active: ; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 1 -; CHECK-NEXT: vmv.v.v v10, v9 -; CHECK-NEXT: vmv.v.v v11, v9 -; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: vsse32.v v8, (a0), a1 ; CHECK-NEXT: ret %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef> store <16 x i32> %v0, ptr %ptr ret void } + +define void @store_factor4_one_active_idx1(ptr %ptr, <4 x i32> %v) { +; CHECK-LABEL: store_factor4_one_active_idx1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, 4 +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret + %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef> + store <16 x i32> %v0, ptr %ptr + ret void +} + +define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) { +; CHECK-LABEL: store_factor4_one_active_fullwidth: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret + %v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef> + store <16 x i32> %v0, ptr %ptr + ret void +} + +; 
TODO: This could be a vslidedown followed by a strided store +define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) { +; CHECK-LABEL: store_factor4_one_active_slidedown: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vmv.v.v v11, v10 +; CHECK-NEXT: vmv.v.v v12, v10 +; CHECK-NEXT: vsseg4e32.v v9, (a0) +; CHECK-NEXT: ret + %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef> + store <16 x i32> %v0, ptr %ptr + ret void +}