diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index d1b6897f287c8..087312f769ada 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -6608,14 +6608,22 @@ void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) { } void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { - assert(vector_len == AVX_128bit? VM_Version::supports_avx() : - vector_len == AVX_256bit? VM_Version::supports_avx2() : - 0, ""); + assert(UseAVX > 0 && (vector_len == Assembler::AVX_512bit || (!needs_evex(dst, nds, src) || VM_Version::supports_avx512vl())), ""); + assert(!needs_evex(dst, nds, src) || VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int24(0x0F, (0xC0 | encode), imm8); } +void Assembler::evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x3, (0xC0 | encode), imm8); +} + void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) { assert(VM_Version::supports_evex(), ""); InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 
45c24f8c83256..1fb5e01dba5e2 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -2167,6 +2167,7 @@ class Assembler : public AbstractAssembler { void palignr(XMMRegister dst, XMMRegister src, int imm8); void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len); void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8); + void evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len); void pblendw(XMMRegister dst, XMMRegister src, int imm8); void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index b5b65171e3277..96b9243225d7e 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -7112,6 +7112,133 @@ void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, X } } +void C2_MacroAssembler::vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister xtmp, int origin, int vlen_enc) { + assert(vlen_enc == Assembler::AVX_256bit, ""); + if (origin < 16) { + // ALIGNR instruction concatenates the corresponding 128 bit + // lanes of two source vectors and then performs the right + // shift operation over intermediate value. Thus source vectors + // lanes needs to shuffled to a format consumable by ALIGNR. + // i.e. + // Initial source vectors + // 0...256 0...256 + // src1 = [v1 v2] and src2= [v3 v4] + // Formatted source vectors when SHIFT < 16 bytes + // 0...256 0...256 + // src1 = [v1 v2] and src2 = [v2 v3] + // Higher 128bit lane of src2 will not impact result, which will be + // sliced from lower and higher 128 bit lane of src1 and lower 128 bit + // lane of src2. + // i.e. 
+ // Result lanes + // res[127:0] = {src1[255:128] , src1[127:0]} >> SHIFT + // res[255:128] = {src2[127:0] , src1[255:128]} >> SHIFT + vperm2i128(xtmp, src1, src2, 0x21); + vpalignr(dst, xtmp, src1, origin, Assembler::AVX_256bit); + } else { + assert(origin > 16 && origin <= 32, ""); + // Similarly, when SHIFT >= 16 bytes, lower 128bit lane of + // src1 will not impact result, which will be sliced from + // higher 128 bit lane of src1 and lower and upper 128 bit + // lanes of src2. + // Thus, two source vector should have following format + // 0...256 0...256 + // src1 = [v2 v3] and src2 = [v3 v4] + // Result lanes + // res[127:0] = {src2[127:0] , src1[255:127]} >> SHIFT + // res[255:128] = {src2[255:128] , src2[127:0]} >> SHIFT + vperm2i128(xtmp, src1, src2, 0x21); + vpalignr(dst, src2, xtmp, origin - 16, Assembler::AVX_256bit); + } +} + + +void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister xtmp, int origin, int vlen_enc) { + if (origin < 16) { + // Initial source vectors + // 0.........512 0.........512 + // src1 = [v1 v2 v3 v4] and src2 = [v5 v6 v7 v8] + // where v* represents 128 bit wide vector lanes. + // When SHIFT <= 16 result will be sliced out from src1 and + // lowest 128 bit vector lane + // of src2. + // ALIGNR will consider following source vector lanes pairs + // spread across two source vectors in order to compute 128 bit + // lanes of result vector. + // res[127:0] = {src1[255:128], src1[127:0]} + // res[255:128] = {src1[383:256], src1[255:128]} + // res[383:256] = {src1[511:384], src1[383:256]} + // res[511:384] = {src2[127:0], src1[511:384]} + // + // ALIGNR concatenates corresponding lanes across source vectors + // before right shifting the intermediate result. 
Therefore, source + // vector lanes should be shuffled to have following format + // src1 = {v1, v2, v3, v4} and src2 = {v2, v3, v4, v5} + // + // |-------------| + // |-----|--------| | + // alignr -> [v1 v2 v3 v4] [v2 v3 v4 v5] + // |_____|________| | + // |_____________| + evalignd(xtmp, src2, src1, 4, vlen_enc); + vpalignr(dst, xtmp, src1, origin, vlen_enc); + } else if (origin > 16 && origin < 32) { + // Similarly, for SHIFT between 16 and 32 bytes + // result will be sliced out of src1 and lower + // two 128 bit lanes of src2. + // i.e. + // res[127:0] = {src1[383:256], src1[255:128]} + // res[255:128] = {src1[511:384], src1[383:256]} + // res[383:256] = {src2[127:0], src1[511:384]} + // res[511:384] = {src2[255:128], src2[127:0]} + // Thus, source vector lanes should have following format. + // src1 = {v2, v3, v4, v5} and src2 = {v3, v4, v5, v6} + evalignd(xtmp, src2, src1, 4, vlen_enc); + evalignd(dst, src2, src1, 8, vlen_enc); + vpalignr(dst, dst, xtmp, origin - 16, vlen_enc); + } else if (origin > 32 && origin < 48) { + // For SHIFT between 32 and 48 bytes + // result will be sliced out of src1 and lower + // four 128 bit lanes of src2. + // i.e. + // res[127:0] = {src1[511:384], src1[383:255]} + // res[255:128] = {src2[127:0], src1[511:384]} + // res[383:256] = {src2[255:128], src2[127:0]} + // res[511:384] = {src2[383:256], src2[255:128]} + // Thus, source vector lanes should have following format. + // src1 = {v3, v4, v5, v6} and src2 = {v4, v5, v6, v7} + evalignd(xtmp, src2, src1, 8, vlen_enc); + evalignd(dst, src2, src1, 12, vlen_enc); + vpalignr(dst, dst, xtmp, origin - 32, vlen_enc); + } else { + // Finally, for SHIFT greater than 48 bytes + // result will be sliced out of upper 128 bit lane of src1 and + // src2. + // i.e. 
+ // res[127:0] = {src2[127:0], src1[511:383]} + // res[255:128] = {src2[255:127], src2[127:0]} + // res[383:256] = {src2[383:256], src2[255:128]} + // res[511:384] = {src2[511:384], src2[383:256]} + // Thus, source vector lanes should have following format. + // src1 = {v4, v5, v6, v7} and src2 = {v5, v6, v7, v8} + assert(origin > 48 && origin < 64, ""); + evalignd(xtmp, src2, src1, 12, vlen_enc); + vpalignr(dst, src2, xtmp, origin - 48, vlen_enc); + } +} + +void C2_MacroAssembler::vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, + XMMRegister xtmp, int origin, int vlen_enc) { + if (VM_Version::supports_avx512vlbw()) { + vector_slice_64B_op(dst, src1, src2, xtmp, origin, vlen_enc); + } else { + assert(vlen_enc == Assembler::AVX_256bit, ""); + vector_slice_32B_op(dst, src1, src2, xtmp, origin, vlen_enc); + } +} + void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { switch(opcode) { case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break; diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index e1652213688b9..284a464ac5542 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -570,6 +570,12 @@ void select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc); + void vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc); + + void vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc); + + void vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc); + void evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc); void evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc); diff 
--git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 2eb748e350c05..291558b6f4cf0 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1723,6 +1723,14 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { return false; // Implementation limitation } break; + case Op_VectorSlice: + if (UseAVX < 1 || size_in_bits < 128) { + return false; + } + if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { + return false; + } + break; case Op_VectorLoadShuffle: case Op_VectorRearrange: if(vlen == 2) { @@ -10759,6 +10767,70 @@ instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2) ins_pipe( pipe_slow ); %} +instruct vector_slice_const_origin_LT16B_reg(vec dst, vec src1, vec src2, immI origin) +%{ + predicate(Matcher::vector_length_in_bytes(n) == 16); + match(Set dst (VectorSlice (Binary src1 src2) origin)); + format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vpalignr($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $origin$$constant, vlen_enc); + %} + ins_pipe(pipe_slow); +%} + +instruct vector_slice_const_origin_GT16B_index16B_reg(vec dst, vec src1, vec src2, immI origin) +%{ + predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw() && n->in(2)->get_int() == 16); + match(Set dst (VectorSlice (Binary src1 src2) origin)); + format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + // src1 = [v2, v1], src2 = [v4, v3] + // dst = [v3, v2] + __ vperm2i128($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0x21); + %} + ins_pipe(pipe_slow); +%} + +instruct vector_slice_const_origin_GT16B_reg(vec dst, vec src1, vec src2, immI origin, vec xtmp) +%{ + predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw() && n->in(2)->get_int() != 16); + match(Set dst 
(VectorSlice (Binary src1 src2) origin)); + effect(TEMP xtmp); + format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vector_slice_op($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, $origin$$constant, vlen_enc); + %} + ins_pipe(pipe_slow); +%} + +instruct vector_slice_const_origin_GT16B_index_multiple4_reg_evex(vec dst, vec src1, vec src2, immI origin) +%{ + predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw() && (n->in(2)->get_int() & 0x3) == 0); + match(Set dst (VectorSlice (Binary src1 src2) origin)); + format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + int normalized_origin = $origin$$constant >> 2; + __ evalignd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, normalized_origin, vlen_enc); + %} + ins_pipe(pipe_slow); +%} + +instruct vector_slice_const_origin_GT16B_reg_evex(vec dst, vec src1, vec src2, immI origin, vec xtmp) +%{ + predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw() && (n->in(2)->get_int() & 0x3) != 0); + match(Set dst (VectorSlice (Binary src1 src2) origin)); + effect(TEMP dst, TEMP xtmp); + format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %} + ins_encode %{ + int vlen_enc = vector_length_encoding(this); + __ vector_slice_op($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, $origin$$constant, vlen_enc); + %} + ins_pipe(pipe_slow); +%} instruct vector_sqrt_HF_reg(vec dst, vec src) %{ diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index b938d5b75608d..a58a95cfc5c0d 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4363,7 +4363,7 @@ bool MatchRule::is_vector() const { "VectorRearrange", 
"VectorLoadShuffle", "VectorLoadConst", "VectorCastB2X", "VectorCastS2X", "VectorCastI2X", "VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F", - "VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", + "VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", "VectorSlice", "VectorMaskWrapper","VectorMaskCmp","VectorReinterpret","LoadVectorMasked","StoreVectorMasked", "FmaVD", "FmaVF", "FmaVHF", "PopCountVI", "PopCountVL", "PopulateIndex", "VectorLongToMask", "CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV", diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 3d110c5706bd2..4600a9bafc0fc 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -1167,6 +1167,18 @@ class methodHandle; "Z") \ do_name(vector_test_name, "test") \ \ + do_intrinsic(_VectorSlice, jdk_internal_vm_vector_VectorSupport, vector_slice_name, vector_slice_sig, F_S) \ + do_signature(vector_slice_sig, "(I" \ + "Ljava/lang/Class;" \ + "Ljava/lang/Class;" \ + "I" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$VectorSliceOp;)" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;") \ + do_name(vector_slice_name, "sliceOp") \ + \ + \ do_intrinsic(_VectorBlend, jdk_internal_vm_vector_VectorSupport, vector_blend_name, vector_blend_sig, F_S) \ do_signature(vector_blend_sig, "(Ljava/lang/Class;" \ "Ljava/lang/Class;" \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index acc2896462740..794d6beb22d53 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -841,6 +841,7 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) { case vmIntrinsics::_VectorSelectFromTwoVectorOp: case vmIntrinsics::_VectorGatherOp: case 
vmIntrinsics::_VectorScatterOp: + case vmIntrinsics::_VectorSlice: case vmIntrinsics::_VectorReductionCoerced: case vmIntrinsics::_VectorTest: case vmIntrinsics::_VectorBlend: diff --git a/src/hotspot/share/opto/callGenerator.cpp b/src/hotspot/share/opto/callGenerator.cpp index e09d8cabe2c8f..2ab2120459dfc 100644 --- a/src/hotspot/share/opto/callGenerator.cpp +++ b/src/hotspot/share/opto/callGenerator.cpp @@ -441,6 +441,31 @@ CallGenerator* CallGenerator::for_mh_late_inline(ciMethod* caller, ciMethod* cal return cg; } +class LateInlineVectorCallGenerator : public LateInlineCallGenerator { + protected: + CallGenerator* _inline_cg; + + public: + LateInlineVectorCallGenerator(ciMethod* method, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) : + LateInlineCallGenerator(method, intrinsic_cg) , _inline_cg(inline_cg) {} + + CallGenerator* inline_cg2() const { return _inline_cg; } + bool inline_fallback(); + virtual bool is_vector_late_inline() const { return true; } +}; + +bool LateInlineVectorCallGenerator::inline_fallback() { + switch (method()->intrinsic_id()) { + case vmIntrinsics::_VectorSlice: return true; + default : return false; + } +} + +CallGenerator* CallGenerator::for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) { + return new LateInlineVectorCallGenerator(m, intrinsic_cg, inline_cg); +} + + // Allow inlining decisions to be delayed class LateInlineVirtualCallGenerator : public VirtualCallGenerator { private: @@ -673,6 +698,14 @@ void CallGenerator::do_late_inline_helper() { // Now perform the inlining using the synthesized JVMState JVMState* new_jvms = inline_cg()->generate(jvms); + // Attempt inlining fallback implementation in case of + // intrinsification failure. 
+ if (new_jvms == nullptr && is_vector_late_inline()) { + LateInlineVectorCallGenerator* late_inline_vec_cg = static_cast<LateInlineVectorCallGenerator*>(this); + if (late_inline_vec_cg->inline_fallback()) { + new_jvms = late_inline_vec_cg->inline_cg2()->generate(jvms); + } + } if (new_jvms == nullptr) return; // no change if (C->failing()) return; diff --git a/src/hotspot/share/opto/callGenerator.hpp b/src/hotspot/share/opto/callGenerator.hpp index 82b195e0c7603..ae2a40ae60d93 100644 --- a/src/hotspot/share/opto/callGenerator.hpp +++ b/src/hotspot/share/opto/callGenerator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -75,6 +75,7 @@ class CallGenerator : public ArenaObj { // same but for method handle calls virtual bool is_mh_late_inline() const { return false; } virtual bool is_string_late_inline() const { return false; } + virtual bool is_vector_late_inline() const { return false; } virtual bool is_boxing_late_inline() const { return false; } virtual bool is_vector_reboxing_late_inline() const { return false; } virtual bool is_virtual_late_inline() const { return false; } @@ -141,6 +142,7 @@ class CallGenerator : public ArenaObj { static CallGenerator* for_late_inline(ciMethod* m, CallGenerator* inline_cg); static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const); static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg); + static CallGenerator* for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg); static CallGenerator* for_boxing_late_inline(ciMethod* m, CallGenerator* inline_cg); static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg); static CallGenerator*
for_late_inline_virtual(ciMethod* m, int vtable_index, float expected_uses); diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 587d5fad8f29e..13b37e0530c4e 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -521,6 +521,7 @@ macro(VectorRearrange) macro(VectorLoadMask) macro(VectorLoadShuffle) macro(VectorLoadConst) +macro(VectorSlice) macro(VectorStoreMask) macro(VectorReinterpret) macro(VectorCast) diff --git a/src/hotspot/share/opto/doCall.cpp b/src/hotspot/share/opto/doCall.cpp index ad7b64f93f5a0..2d998b7e6f128 100644 --- a/src/hotspot/share/opto/doCall.cpp +++ b/src/hotspot/share/opto/doCall.cpp @@ -164,7 +164,9 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool cg_intrinsic = cg; cg = nullptr; } else if (IncrementalInline && should_delay_vector_inlining(callee, jvms)) { - return CallGenerator::for_late_inline(callee, cg); + float expected_uses = jvms->method()->scale_count(site_count, prof_factor); + CallGenerator* inline_cg = CallGenerator::for_inline(callee, expected_uses); + return CallGenerator::for_vector_late_inline(callee, cg, inline_cg); } else { return cg; } diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 6c5efaafc21ab..7599225dccffd 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -773,6 +773,8 @@ bool LibraryCallKit::try_to_inline(int predicate) { return inline_index_vector(); case vmIntrinsics::_IndexPartiallyInUpperRange: return inline_index_partially_in_upper_range(); + case vmIntrinsics::_VectorSlice: + return inline_vector_slice(); case vmIntrinsics::_getObjectSize: return inline_getObjectSize(); diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index fbc6007d4e195..4c2286e061d83 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ 
-411,6 +411,7 @@ class LibraryCallKit : public GraphKit { bool inline_vector_convert(); bool inline_vector_extract(); bool inline_vector_insert(); + bool inline_vector_slice(); bool inline_vector_compress_expand(); bool inline_index_vector(); bool inline_index_partially_in_upper_range(); diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 5cb56019bc144..dde71f06dd24d 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2509,6 +2509,7 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(3); break; } + case Op_VectorSlice: case Op_VectorBlend: case Op_VectorInsert: { Node* pair = new BinaryNode(n->in(1), n->in(2)); diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 10430a09e7231..5adcfa01877c9 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1660,6 +1660,76 @@ bool LibraryCallKit::inline_vector_blend() { return true; } + +// public interface VectorSliceOp> { +// VectorPayload apply(int origin, V v1, V v2); +// } +// +// public static +// , +// E> +// VectorPayload sliceOp(int origin, +// Class vClass, Class eClass, int length, V v1, V v2, +// VectorSliceOp defaultImpl) +bool LibraryCallKit::inline_vector_slice() { + const TypeInt* origin = gvn().type(argument(0))->isa_int(); + const TypeInstPtr* vector_klass = gvn().type(argument(1))->isa_instptr(); + const TypeInstPtr* elem_klass = gvn().type(argument(2))->isa_instptr(); + const TypeInt* vlen = gvn().type(argument(3))->isa_int(); + + if (origin == nullptr || vector_klass == nullptr || elem_klass == nullptr || vlen == nullptr) { + return false; // dead code + } + if (vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr || !vlen->is_con()) { + log_if_needed(" ** missing constant: vclass=%s etype=%s vlen=%s", + NodeClassNames[argument(1)->Opcode()], + 
NodeClassNames[argument(2)->Opcode()], + NodeClassNames[argument(3)->Opcode()]); + return false; // not enough info for intrinsification + } + if (!is_klass_initialized(vector_klass)) { + log_if_needed(" ** klass argument not initialized"); + return false; + } + ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type(); + if (!elem_type->is_primitive_type()) { + log_if_needed(" ** not a primitive bt=%d", elem_type->basic_type()); + return false; // should be primitive type + } + + int num_elem = vlen->get_con(); + BasicType elem_bt = elem_type->basic_type(); + + if (!origin->is_con()) { + log_if_needed(" ** vector slice from non-constant index not supported"); + return false; + } + + if (!arch_supports_vector(Op_VectorSlice, num_elem, elem_bt, VecMaskNotUsed)) { + log_if_needed(" ** not supported: arity=2 op=slice vlen=%d etype=%s", + num_elem, type2name(elem_bt)); + return false; // not supported + } + + ciKlass* vbox_klass = vector_klass->const_oop()->as_instance()->java_lang_Class_klass(); + const TypeInstPtr* vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass); + + Node* v1 = unbox_vector(argument(4), vbox_type, elem_bt, num_elem); + Node* v2 = unbox_vector(argument(5), vbox_type, elem_bt, num_elem); + if (v1 == nullptr || v2 == nullptr) { + return false; // operand unboxing failed + } + + // Defining origin in terms of number of bytes to make it type agnostic value. 
+ Node* origin_node = gvn().intcon(origin->get_con() * type2aelembytes(elem_bt)); + const TypeVect* vector_type = TypeVect::make(elem_bt, num_elem); + Node* operation = gvn().transform(new VectorSliceNode(v1, v2, origin_node, vector_type)); + Node* box = box_vector(operation, vbox_type, elem_bt, num_elem); + set_result(box); + C->set_max_vector_size(MAX2(C->max_vector_size(), (uint)(num_elem * type2aelembytes(elem_bt)))); + return true; +} + // public static // , // M extends VectorMask, diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index c126c91da1b3a..d04577e476165 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -2403,6 +2403,21 @@ Node* UMaxVNode::Identity(PhaseGVN* phase) { } return this; } + +Node* VectorSliceNode::Identity(PhaseGVN* phase) { + if (origin()->is_Con()) { + jint index = origin()->get_int(); + uint vlen = vect_type()->length_in_bytes(); + if (vlen == (uint)index) { + return vec2(); + } + if (index == 0) { + return vec1(); + } + } + return this; +} + #ifndef PRODUCT void VectorBoxAllocateNode::dump_spec(outputStream *st) const { CallStaticJavaNode::dump_spec(st); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 463680d0a52dd..27303bccc28d3 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1717,6 +1717,20 @@ class VectorTestNode : public CmpNode { } }; +class VectorSliceNode : public VectorNode { + public: + VectorSliceNode(Node* vec1, Node* vec2, Node* origin, const TypeVect* vt) + : VectorNode(vec1, vec2, origin, vt) { + } + + virtual int Opcode() const; + Node* vec1() const { return in(1); } + Node* vec2() const { return in(2); } + Node* origin() const { return in(3); } + virtual Node* Identity(PhaseGVN* phase); +}; + + class VectorBlendNode : public VectorNode { public: VectorBlendNode(Node* vec1, Node* vec2, Node* mask) diff --git 
a/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java b/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java index d3705a279ca28..4cd1b204feb5a 100644 --- a/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java +++ b/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java @@ -758,6 +758,20 @@ long maskReductionCoerced(int oper, /* ============================================================================ */ + public interface VectorSliceOp, E> { + V apply(int origin, V v1, V v2); + } + + @IntrinsicCandidate + public static + , + E> + V sliceOp(int origin, Class vClass, Class eClass, int length, V v1, V v2, + VectorSliceOp defaultImpl) { + assert isNonCapturingLambda(defaultImpl) : defaultImpl; + return defaultImpl.apply(origin, v1, v2); + } + // Returns a string containing a list of CPU features VM detected. public static native String getCPUFeatures(); diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java index 08406fef518df..5c8ec8348864c 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java @@ -2304,18 +2304,24 @@ public final ByteVector blend(long e, public abstract ByteVector slice(int origin, Vector v1); + /*package-private*/ final @ForceInline - ByteVector sliceTemplate(int origin, Vector v1) { + > + ByteVector sliceTemplate(int origin, V v1) { ByteVector that = (ByteVector) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - ByteVector iotaVector = (ByteVector) iotaShuffle().toBitsVector(); - ByteVector filter = broadcast((byte)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + return 
(ByteVector)VectorSupport.sliceOp(origin, getClass(), byte.class, length(), this, that, + (index, vec1, vec2) -> { + ByteVector iotaVector = (ByteVector) vec1.iotaShuffle().toBitsVector(); + ByteVector filter = vec1.broadcast((byte)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2342,11 +2348,16 @@ ByteVector slice(int origin, @ForceInline ByteVector sliceTemplate(int origin) { Objects.checkIndex(origin, length() + 1); - ByteVector iotaVector = (ByteVector) iotaShuffle().toBitsVector(); - ByteVector filter = broadcast((byte)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + ByteVector that = (ByteVector) vspecies().zero(); + return (ByteVector)VectorSupport.sliceOp(origin, getClass(), byte.class, length(), this, that, + (index, vec1, vec2) -> { + ByteVector iotaVector = (ByteVector) vec1.iotaShuffle().toBitsVector(); + ByteVector filter = vec1.broadcast((byte)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java index 786cd089ebecb..6237e977f042f 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java @@ -2158,18 +2158,25 @@ public final DoubleVector blend(long e, public abstract DoubleVector slice(int origin, Vector v1); + /*package-private*/ final 
@ForceInline - DoubleVector sliceTemplate(int origin, Vector v1) { + > + DoubleVector sliceTemplate(int origin, V v1) { DoubleVector that = (DoubleVector) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - LongVector iotaVector = (LongVector) iotaShuffle().toBitsVector(); - LongVector filter = LongVector.broadcast((LongVector.LongSpecies) vspecies().asIntegral(), (long)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vspecies()); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + return (DoubleVector)VectorSupport.sliceOp(origin, getClass(), double.class, length(), this, that, + (index, vec1, vec2) -> { + LongVector iotaVector = (LongVector) vec1.iotaShuffle().toBitsVector(); + LongVector filter = LongVector.broadcast((LongVector.LongSpecies) vec1.vspecies().asIntegral(), + (long)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vec1.vspecies()); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2196,11 +2203,17 @@ DoubleVector slice(int origin, @ForceInline DoubleVector sliceTemplate(int origin) { Objects.checkIndex(origin, length() + 1); - LongVector iotaVector = (LongVector) iotaShuffle().toBitsVector(); - LongVector filter = LongVector.broadcast((LongVector.LongSpecies) vspecies().asIntegral(), (long)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vspecies()); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + DoubleVector that = (DoubleVector) vspecies().zero(); + return (DoubleVector)VectorSupport.sliceOp(origin, getClass(), double.class, length(), this, that, + (index, vec1, vec2) -> { + LongVector iotaVector = (LongVector) 
vec1.iotaShuffle().toBitsVector(); + LongVector filter = LongVector.broadcast((LongVector.LongSpecies) vec1.vspecies().asIntegral(), + (long)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vec1.vspecies()); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java index b481d5a51d740..d93091cbc5ed6 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java @@ -2170,18 +2170,25 @@ public final FloatVector blend(long e, public abstract FloatVector slice(int origin, Vector v1); + /*package-private*/ final @ForceInline - FloatVector sliceTemplate(int origin, Vector v1) { + > + FloatVector sliceTemplate(int origin, V v1) { FloatVector that = (FloatVector) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - IntVector iotaVector = (IntVector) iotaShuffle().toBitsVector(); - IntVector filter = IntVector.broadcast((IntVector.IntSpecies) vspecies().asIntegral(), (int)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vspecies()); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + return (FloatVector)VectorSupport.sliceOp(origin, getClass(), float.class, length(), this, that, + (index, vec1, vec2) -> { + IntVector iotaVector = (IntVector) vec1.iotaShuffle().toBitsVector(); + IntVector filter = IntVector.broadcast((IntVector.IntSpecies) vec1.vspecies().asIntegral(), + (int)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vec1.vspecies()); + AbstractShuffle iota = vec1.iotaShuffle(index, 
1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2208,11 +2215,17 @@ FloatVector slice(int origin, @ForceInline FloatVector sliceTemplate(int origin) { Objects.checkIndex(origin, length() + 1); - IntVector iotaVector = (IntVector) iotaShuffle().toBitsVector(); - IntVector filter = IntVector.broadcast((IntVector.IntSpecies) vspecies().asIntegral(), (int)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vspecies()); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + FloatVector that = (FloatVector) vspecies().zero(); + return (FloatVector)VectorSupport.sliceOp(origin, getClass(), float.class, length(), this, that, + (index, vec1, vec2) -> { + IntVector iotaVector = (IntVector) vec1.iotaShuffle().toBitsVector(); + IntVector filter = IntVector.broadcast((IntVector.IntSpecies) vec1.vspecies().asIntegral(), + (int)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vec1.vspecies()); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java index 43356b9ea6c9e..5192d8d476c3d 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java @@ -2289,18 +2289,24 @@ public final IntVector blend(long e, public abstract IntVector slice(int origin, Vector v1); + /*package-private*/ final @ForceInline - IntVector sliceTemplate(int origin, Vector v1) { + > + IntVector sliceTemplate(int origin, V v1) { IntVector that = (IntVector) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - IntVector iotaVector = (IntVector) 
iotaShuffle().toBitsVector(); - IntVector filter = broadcast((int)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + return (IntVector)VectorSupport.sliceOp(origin, getClass(), int.class, length(), this, that, + (index, vec1, vec2) -> { + IntVector iotaVector = (IntVector) vec1.iotaShuffle().toBitsVector(); + IntVector filter = vec1.broadcast((int)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2327,11 +2333,16 @@ IntVector slice(int origin, @ForceInline IntVector sliceTemplate(int origin) { Objects.checkIndex(origin, length() + 1); - IntVector iotaVector = (IntVector) iotaShuffle().toBitsVector(); - IntVector filter = broadcast((int)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + IntVector that = (IntVector) vspecies().zero(); + return (IntVector)VectorSupport.sliceOp(origin, getClass(), int.class, length(), this, that, + (index, vec1, vec2) -> { + IntVector iotaVector = (IntVector) vec1.iotaShuffle().toBitsVector(); + IntVector filter = vec1.broadcast((int)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java index 8947343ff306b..ee8f92337eef0 100644 --- 
a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java @@ -2155,18 +2155,24 @@ public final LongVector blend(long e, public abstract LongVector slice(int origin, Vector v1); + /*package-private*/ final @ForceInline - LongVector sliceTemplate(int origin, Vector v1) { + > + LongVector sliceTemplate(int origin, V v1) { LongVector that = (LongVector) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - LongVector iotaVector = (LongVector) iotaShuffle().toBitsVector(); - LongVector filter = broadcast((long)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + return (LongVector)VectorSupport.sliceOp(origin, getClass(), long.class, length(), this, that, + (index, vec1, vec2) -> { + LongVector iotaVector = (LongVector) vec1.iotaShuffle().toBitsVector(); + LongVector filter = vec1.broadcast((long)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2193,11 +2199,16 @@ LongVector slice(int origin, @ForceInline LongVector sliceTemplate(int origin) { Objects.checkIndex(origin, length() + 1); - LongVector iotaVector = (LongVector) iotaShuffle().toBitsVector(); - LongVector filter = broadcast((long)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + LongVector that = (LongVector) vspecies().zero(); + return (LongVector)VectorSupport.sliceOp(origin, getClass(), long.class, length(), this, that, + (index, vec1, vec2) -> { + LongVector iotaVector 
= (LongVector) vec1.iotaShuffle().toBitsVector(); + LongVector filter = vec1.broadcast((long)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java index e222c6d25f390..c35dfdae7ba5e 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java @@ -2305,18 +2305,24 @@ public final ShortVector blend(long e, public abstract ShortVector slice(int origin, Vector v1); + /*package-private*/ final @ForceInline - ShortVector sliceTemplate(int origin, Vector v1) { + > + ShortVector sliceTemplate(int origin, V v1) { ShortVector that = (ShortVector) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - ShortVector iotaVector = (ShortVector) iotaShuffle().toBitsVector(); - ShortVector filter = broadcast((short)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + return (ShortVector)VectorSupport.sliceOp(origin, getClass(), short.class, length(), this, that, + (index, vec1, vec2) -> { + ShortVector iotaVector = (ShortVector) vec1.iotaShuffle().toBitsVector(); + ShortVector filter = vec1.broadcast((short)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2343,11 +2349,16 @@ ShortVector slice(int origin, @ForceInline ShortVector sliceTemplate(int origin) { 
Objects.checkIndex(origin, length() + 1); - ShortVector iotaVector = (ShortVector) iotaShuffle().toBitsVector(); - ShortVector filter = broadcast((short)(length() - origin)); - VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); - AbstractShuffle iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + ShortVector that = (ShortVector) vspecies().zero(); + return (ShortVector)VectorSupport.sliceOp(origin, getClass(), short.class, length(), this, that, + (index, vec1, vec2) -> { + ShortVector iotaVector = (ShortVector) vec1.iotaShuffle().toBitsVector(); + ShortVector filter = vec1.broadcast((short)(vec1.length() - index)); + VectorMask blendMask = iotaVector.compare(VectorOperators.LT, filter); + AbstractShuffle iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template index f7d987fd280a0..44fd8e0f61dac 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template @@ -2717,23 +2717,30 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { public abstract $abstractvectortype$ slice(int origin, Vector<$Boxtype$> v1); + /*package-private*/ final @ForceInline - $abstractvectortype$ sliceTemplate(int origin, Vector<$Boxtype$> v1) { + > + $abstractvectortype$ sliceTemplate(int origin, V v1) { $abstractvectortype$ that = ($abstractvectortype$) v1; that.check(this); Objects.checkIndex(origin, length() + 1); - $Bitstype$Vector iotaVector = ($Bitstype$Vector) iotaShuffle().toBitsVector(); + return ($abstractvectortype$)VectorSupport.sliceOp(origin, getClass(), $type$.class, length(), this, that, + (index, vec1, vec2) -> { + 
$Bitstype$Vector iotaVector = ($Bitstype$Vector) vec1.iotaShuffle().toBitsVector(); #if[FP] - $Bitstype$Vector filter = $Bitstype$Vector.broadcast(($Bitstype$Vector.$Bitstype$Species) vspecies().asIntegral(), ($bitstype$)(length() - origin)); - VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vspecies()); + $Bitstype$Vector filter = $Bitstype$Vector.broadcast(($Bitstype$Vector.$Bitstype$Species) vec1.vspecies().asIntegral(), + ($bitstype$)(vec1.length() - index)); + VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vec1.vspecies()); #else[FP] - $abstractvectortype$ filter = broadcast(($type$)(length() - origin)); - VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter); + $abstractvectortype$ filter = vec1.broadcast(($type$)(vec1.length() - index)); + VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter); #end[FP] - AbstractShuffle<$Boxtype$> iota = iotaShuffle(origin, 1, true); - return that.rearrange(iota).blend(this.rearrange(iota), blendMask); + AbstractShuffle<$Boxtype$> iota = vec1.iotaShuffle(index, 1, true); + return vec2.rearrange(iota).blend(vec1.rearrange(iota), blendMask); + } + ); } /** @@ -2760,16 +2767,22 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { @ForceInline $abstractvectortype$ sliceTemplate(int origin) { Objects.checkIndex(origin, length() + 1); - $Bitstype$Vector iotaVector = ($Bitstype$Vector) iotaShuffle().toBitsVector(); + $abstractvectortype$ that = ($abstractvectortype$) vspecies().zero(); + return ($abstractvectortype$)VectorSupport.sliceOp(origin, getClass(), $type$.class, length(), this, that, + (index, vec1, vec2) -> { + $Bitstype$Vector iotaVector = ($Bitstype$Vector) vec1.iotaShuffle().toBitsVector(); #if[FP] - $Bitstype$Vector filter = $Bitstype$Vector.broadcast(($Bitstype$Vector.$Bitstype$Species) vspecies().asIntegral(), ($bitstype$)(length() - origin)); - 
VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vspecies()); + $Bitstype$Vector filter = $Bitstype$Vector.broadcast(($Bitstype$Vector.$Bitstype$Species) vec1.vspecies().asIntegral(), + ($bitstype$)(vec1.length() - index)); + VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter).cast(vec1.vspecies()); #else[FP] - $abstractvectortype$ filter = broadcast(($type$)(length() - origin)); - VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter); + $abstractvectortype$ filter = vec1.broadcast(($type$)(vec1.length() - index)); + VectorMask<$Boxtype$> blendMask = iotaVector.compare(VectorOperators.LT, filter); #end[FP] - AbstractShuffle<$Boxtype$> iota = iotaShuffle(origin, 1, true); - return vspecies().zero().blend(this.rearrange(iota), blendMask); + AbstractShuffle<$Boxtype$> iota = vec1.iotaShuffle(index, 1, true); + return vec2.blend(vec1.rearrange(iota), blendMask); + } + ); } /** diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 7fb1eeb800c91..baea4c274f981 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2240,6 +2240,26 @@ public class IRNode { machOnlyNameRegex(RISCV_VAND_NOTL_VX_MASKED, "vand_notL_vx_masked"); } + public static final String VECTOR_SLICE_B = VECTOR_PREFIX + "VECTOR_SLICE_B" + POSTFIX; + static { + vectorNode(VECTOR_SLICE_B, "VectorSlice", TYPE_BYTE); + } + + public static final String VECTOR_SLICE_S = VECTOR_PREFIX + "VECTOR_SLICE_S" + POSTFIX; + static { + vectorNode(VECTOR_SLICE_S, "VectorSlice", TYPE_SHORT); + } + + public static final String VECTOR_SLICE_I = VECTOR_PREFIX + "VECTOR_SLICE_I" + POSTFIX; + static { + vectorNode(VECTOR_SLICE_I, "VectorSlice", TYPE_INT); + } + + public static final String VECTOR_SLICE_L = VECTOR_PREFIX + "VECTOR_SLICE_L" + POSTFIX; + static { + 
vectorNode(VECTOR_SLICE_L, "VectorSlice", TYPE_LONG); + } + public static final String VECTOR_BLEND_B = VECTOR_PREFIX + "VECTOR_BLEND_B" + POSTFIX; static { vectorNode(VECTOR_BLEND_B, "VectorBlend", TYPE_BYTE); diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestSliceOptValueTransforms.java b/test/hotspot/jtreg/compiler/vectorapi/TestSliceOptValueTransforms.java new file mode 100644 index 0000000000000..f6e65149c60e0 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestSliceOptValueTransforms.java @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/** +* @test +* @bug 8303762 +* @summary Optimize vector slice operation with constant index using VPALIGNR instruction +* @modules jdk.incubator.vector +* @library /test/lib / +* @run main/othervm compiler.vectorapi.TestSliceOptValueTransforms +*/ +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; +import static compiler.lib.generators.Generators.G; + +public class TestSliceOptValueTransforms { + public static final int SIZE = 1024; + + public static final VectorSpecies BSP = ByteVector.SPECIES_PREFERRED; + public static final VectorSpecies SSP = ShortVector.SPECIES_PREFERRED; + public static final VectorSpecies ISP = IntVector.SPECIES_PREFERRED; + public static final VectorSpecies LSP = LongVector.SPECIES_PREFERRED; + + public static byte [] bsrc1; + public static byte [] bsrc2; + public static byte [] bdst; + + public static short [] ssrc1; + public static short [] ssrc2; + public static short [] sdst; + + public static int [] isrc1; + public static int [] isrc2; + public static int [] idst; + + public static long [] lsrc1; + public static long [] lsrc2; + public static long [] ldst; + + public TestSliceOptValueTransforms() { + bsrc1 = new byte[SIZE]; + bsrc2 = new byte[SIZE]; + bdst = new byte[SIZE]; + + ssrc1 = new short[SIZE]; + ssrc2 = new short[SIZE]; + sdst = new short[SIZE]; + + isrc1 = new int[SIZE]; + isrc2 = new int[SIZE]; + idst = new int[SIZE]; + + lsrc1 = new long[SIZE]; + lsrc2 = new long[SIZE]; + ldst = new long[SIZE]; + + G.fill(G.ints(), isrc1); + G.fill(G.ints(), isrc2); + G.fill(G.longs(), lsrc1); + G.fill(G.longs(), lsrc2); + + for (int i = 0; i < SIZE; i++) { + bsrc1[i] = (byte)(isrc1[i]); + bsrc2[i] = (byte)(isrc2[i]); + + ssrc1[i] = (short)(isrc1[i]); + ssrc2[i] = (short)(isrc2[i]); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_B}, applyIfCPUFeatureAnd = {"avx2", "true"}) + public void testZeroSliceIndexByte() { + for (int i = 0; i < 
BSP.loopBound(SIZE); i += BSP.length()) { + ByteVector.fromArray(BSP, bsrc1, i) + .slice(0, ByteVector.fromArray(BSP, bsrc2, i)) + .intoArray(bdst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_B}, applyIfCPUFeature = {"avx2", "true"}) + public void testMaxSliceIndexByte() { + for (int i = 0; i < BSP.loopBound(SIZE); i += BSP.length()) { + ByteVector.fromArray(BSP, bsrc1, i) + .slice(BSP.length(), ByteVector.fromArray(BSP, bsrc2, i)) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {IRNode.VECTOR_SLICE_B, " >0 "}, applyIfCPUFeature = {"avx2", "true"}) + public void testConstantSliceIndexByte() { + for (int i = 0; i < BSP.loopBound(SIZE); i += BSP.length()) { + ByteVector.fromArray(BSP, bsrc1, i) + .slice(1, ByteVector.fromArray(BSP, bsrc2, i)) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index16B_reg", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeatureAnd = {"avx512f", "false", "avx2", "true"}) + public void test16BSliceIndexByte() { + for (int i = 0; i < BSP.loopBound(SIZE); i += BSP.length()) { + ByteVector.fromArray(BSP, bsrc1, i) + .slice(16, ByteVector.fromArray(BSP, bsrc2, i)) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index_multiple4_reg_evex", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeature = {"avx512vl", "true"}) + public void testMultipleOf4BSliceIndexByte() { + for (int i = 0; i < BSP.loopBound(SIZE); i += BSP.length()) { + ByteVector.fromArray(BSP, bsrc1, i) + .slice(8, ByteVector.fromArray(BSP, bsrc2, i)) + .intoArray(bdst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_S}, applyIfCPUFeatureAnd = {"avx2", "true"}) + public void testZeroSliceIndexShort() { + for (int i = 0; i < SSP.loopBound(SIZE); i += SSP.length()) { + ShortVector.fromArray(SSP, ssrc1, i) + .slice(0, ShortVector.fromArray(SSP, ssrc2, i)) + .intoArray(sdst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_S}, applyIfCPUFeature = 
{"avx2", "true"}) + public void testMaxSliceIndexShort() { + for (int i = 0; i < SSP.loopBound(SIZE); i += SSP.length()) { + ShortVector.fromArray(SSP, ssrc1, i) + .slice(SSP.length(), ShortVector.fromArray(SSP, ssrc2, i)) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {IRNode.VECTOR_SLICE_S, " >0 "}, applyIfCPUFeature = {"avx2", "true"}) + public void testConstantSliceIndexShort() { + for (int i = 0; i < SSP.loopBound(SIZE); i += SSP.length()) { + ShortVector.fromArray(SSP, ssrc1, i) + .slice(1, ShortVector.fromArray(SSP, ssrc2, i)) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index16B_reg", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeatureAnd = {"avx512f", "false", "avx2", "true"}) + public void test16BSliceIndexShort() { + for (int i = 0; i < SSP.loopBound(SIZE); i += SSP.length()) { + ShortVector.fromArray(SSP, ssrc1, i) + .slice(8, ShortVector.fromArray(SSP, ssrc2, i)) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index_multiple4_reg_evex", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeature = {"avx512vl", "true"}) + public void testMultipleOf4BSliceIndexShort() { + for (int i = 0; i < SSP.loopBound(SIZE); i += SSP.length()) { + ShortVector.fromArray(SSP, ssrc1, i) + .slice(4, ShortVector.fromArray(SSP, ssrc2, i)) + .intoArray(sdst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_I}, applyIfCPUFeatureAnd = {"avx2", "true"}) + public void testZeroSliceIndexInt() { + for (int i = 0; i < ISP.loopBound(SIZE); i += ISP.length()) { + IntVector.fromArray(ISP, isrc1, i) + .slice(0, IntVector.fromArray(ISP, isrc2, i)) + .intoArray(idst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_I}, applyIfCPUFeature = {"avx2", "true"}) + public void testMaxSliceIndexInt() { + for (int i = 0; i < ISP.loopBound(SIZE); i += ISP.length()) { + IntVector.fromArray(ISP, isrc1, i) + .slice(ISP.length(), IntVector.fromArray(ISP, isrc2, i)) + 
.intoArray(idst, i); + } + } + + @Test + @IR(counts = {IRNode.VECTOR_SLICE_I, " >0 "}, applyIfCPUFeature = {"avx2", "true"}) + public void testConstantSliceIndexInt() { + for (int i = 0; i < ISP.loopBound(SIZE); i += ISP.length()) { + IntVector.fromArray(ISP, isrc1, i) + .slice(1, IntVector.fromArray(ISP, isrc2, i)) + .intoArray(idst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index16B_reg", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeatureAnd = {"avx512f", "false", "avx2", "true"}) + public void test16BSliceIndexInt() { + for (int i = 0; i < ISP.loopBound(SIZE); i += ISP.length()) { + IntVector.fromArray(ISP, isrc1, i) + .slice(4, IntVector.fromArray(ISP, isrc2, i)) + .intoArray(idst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index_multiple4_reg_evex", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeature = {"avx512vl", "true"}) + public void testMultipleOf4BSliceIndexInt() { + for (int i = 0; i < ISP.loopBound(SIZE); i += ISP.length()) { + IntVector.fromArray(ISP, isrc1, i) + .slice(4, IntVector.fromArray(ISP, isrc2, i)) + .intoArray(idst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_L}, applyIfCPUFeatureAnd = {"avx2", "true"}) + public void testZeroSliceIndexLong() { + for (int i = 0; i < LSP.loopBound(SIZE); i += LSP.length()) { + LongVector.fromArray(LSP, lsrc1, i) + .slice(0, LongVector.fromArray(LSP, lsrc2, i)) + .intoArray(ldst, i); + } + } + + @Test + @IR(failOn = {IRNode.VECTOR_SLICE_L}, applyIfCPUFeature = {"avx2", "true"}) + public void testMaxSliceIndexLong() { + for (int i = 0; i < LSP.loopBound(SIZE); i += LSP.length()) { + LongVector.fromArray(LSP, lsrc1, i) + .slice(LSP.length(), LongVector.fromArray(LSP, lsrc2, i)) + .intoArray(ldst, i); + } + } + + @Test + @IR(counts = {IRNode.VECTOR_SLICE_L, " >0 "}, applyIfCPUFeature = {"avx2", "true"}) + public void testConstantSliceIndexLong() { + for (int i = 0; i < LSP.loopBound(SIZE); i += LSP.length()) { + 
LongVector.fromArray(LSP, lsrc1, i) + .slice(1, LongVector.fromArray(LSP, lsrc2, i)) + .intoArray(ldst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index16B_reg", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeatureAnd = {"avx512f", "false", "avx2", "true"}) + public void test16BSliceIndexLong() { + for (int i = 0; i < LSP.loopBound(SIZE); i += LSP.length()) { + LongVector.fromArray(LSP, lsrc1, i) + .slice(2, LongVector.fromArray(LSP, lsrc2, i)) + .intoArray(ldst, i); + } + } + + @Test + @IR(counts = {"vector_slice_const_origin_GT16B_index_multiple4_reg_evex", " >0 "}, + phase = {CompilePhase.MATCHING}, applyIfCPUFeature = {"avx512vl", "true"}) + public void testMultipleOf4BSliceIndexLong() { + for (int i = 0; i < LSP.loopBound(SIZE); i += LSP.length()) { + LongVector.fromArray(LSP, lsrc1, i) + .slice(2, LongVector.fromArray(LSP, lsrc2, i)) + .intoArray(ldst, i); + } + } + + public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector"); + } +} diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorSliceBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorSliceBenchmark.java new file mode 100644 index 0000000000000..78dbe5d537ffb --- /dev/null +++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorSliceBenchmark.java @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

package org.openjdk.bench.jdk.incubator.vector;

import java.util.concurrent.TimeUnit;
import java.util.Random;
import jdk.incubator.vector.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

/**
 * Measures {@code Vector.slice(origin, other)} across lane types (byte, short,
 * int, long) with both compile-time-constant and loop-variant slice origins.
 * Constant origins let the JIT select specialized alignment instructions
 * (e.g. x86 PALIGNR/VALIGND), while the variable-origin variants exercise the
 * generic path.
 */
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgs = {"--add-modules=jdk.incubator.vector"})
public class VectorSliceBenchmark {
    @Param({"1024", "2047", "4096"})
    int size;

    byte [] bsrc1;
    byte [] bsrc2;
    byte [] bdst;

    short [] ssrc1;
    short [] ssrc2;
    short [] sdst;

    int [] isrc1;
    int [] isrc2;
    int [] idst;

    long [] lsrc1;
    long [] lsrc2;
    long [] ldst;

    // Parameterized species (rather than raw VectorSpecies) avoids unchecked
    // raw-type usage; behavior is identical.
    static final VectorSpecies<Byte> bspecies = ByteVector.SPECIES_PREFERRED;
    static final VectorSpecies<Short> sspecies = ShortVector.SPECIES_PREFERRED;
    static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_PREFERRED;
    static final VectorSpecies<Long> lspecies = LongVector.SPECIES_PREFERRED;

    // Two constant slice origins per lane type: half and quarter of the
    // vector length, both guaranteed in-range for the species.
    static final int B_SLICE_IDX1 = bspecies.length() / 2;
    static final int B_SLICE_IDX2 = bspecies.length() / 4;

    static final int S_SLICE_IDX1 = sspecies.length() / 2;
    static final int S_SLICE_IDX2 = sspecies.length() / 4;

    static final int I_SLICE_IDX1 = ispecies.length() / 2;
    static final int I_SLICE_IDX2 = ispecies.length() / 4;

    static final int L_SLICE_IDX1 = lspecies.length() / 2;
    static final int L_SLICE_IDX2 = lspecies.length() / 4;

    @Setup(Level.Trial)
    public void BmSetup() {
        // Fixed seed keeps input data reproducible across forks/runs.
        Random r = new Random(2048);

        bsrc1 = new byte[size];
        bsrc2 = new byte[size];
        bdst = new byte[size];

        ssrc1 = new short[size];
        ssrc2 = new short[size];
        sdst = new short[size];

        isrc1 = new int[size];
        isrc2 = new int[size];
        idst = new int[size];

        lsrc1 = new long[size];
        lsrc2 = new long[size];
        ldst = new long[size];

        for (int i = 0; i < size; i++) {
            bsrc1[i] = (byte)r.nextInt(size);
            bsrc2[i] = (byte)r.nextInt(size);

            ssrc1[i] = (short)r.nextInt(size);
            ssrc2[i] = (short)r.nextInt(size);

            isrc1[i] = r.nextInt(size);
            isrc2[i] = r.nextInt(size);

            lsrc1[i] = r.nextLong(size);
            lsrc2[i] = r.nextLong(size);
        }
    }

    @Benchmark
    public void byteVectorSliceWithConstantIndex1() {
        for (int i = 0; i < bspecies.loopBound(bdst.length); i += bspecies.length()) {
            ByteVector.fromArray(bspecies, bsrc1, i)
                      .slice(B_SLICE_IDX1, ByteVector.fromArray(bspecies, bsrc2, i))
                      .intoArray(bdst, i);
        }
    }

    @Benchmark
    public void byteVectorSliceWithConstantIndex2() {
        for (int i = 0; i < bspecies.loopBound(bdst.length); i += bspecies.length()) {
            ByteVector.fromArray(bspecies, bsrc1, i)
                      .slice(B_SLICE_IDX2, ByteVector.fromArray(bspecies, bsrc2, i))
                      .intoArray(bdst, i);
        }
    }

    @Benchmark
    public void byteVectorSliceWithVariableIndex() {
        // Origin varies per iteration (masked into [0, VLEN)), defeating
        // constant-folding of the slice index.
        for (int i = 0; i < bspecies.loopBound(bdst.length); i += bspecies.length()) {
            ByteVector.fromArray(bspecies, bsrc1, i)
                      .slice(i & (bspecies.length() - 1), ByteVector.fromArray(bspecies, bsrc2, i))
                      .intoArray(bdst, i);
        }
    }

    @Benchmark
    public void shortVectorSliceWithConstantIndex1() {
        // Fixed: loop previously advanced by bspecies.length() (byte lanes),
        // which is 2x the short lane count and skipped half the elements.
        for (int i = 0; i < sspecies.loopBound(sdst.length); i += sspecies.length()) {
            ShortVector.fromArray(sspecies, ssrc1, i)
                       .slice(S_SLICE_IDX1, ShortVector.fromArray(sspecies, ssrc2, i))
                       .intoArray(sdst, i);
        }
    }

    @Benchmark
    public void shortVectorSliceWithConstantIndex2() {
        // Fixed: stride by the short species length, not the byte species length.
        for (int i = 0; i < sspecies.loopBound(sdst.length); i += sspecies.length()) {
            ShortVector.fromArray(sspecies, ssrc1, i)
                       .slice(S_SLICE_IDX2, ShortVector.fromArray(sspecies, ssrc2, i))
                       .intoArray(sdst, i);
        }
    }

    @Benchmark
    public void shortVectorSliceWithVariableIndex() {
        // Fixed: stride by the short species length, not the byte species length.
        for (int i = 0; i < sspecies.loopBound(sdst.length); i += sspecies.length()) {
            ShortVector.fromArray(sspecies, ssrc1, i)
                       .slice(i & (sspecies.length() - 1), ShortVector.fromArray(sspecies, ssrc2, i))
                       .intoArray(sdst, i);
        }
    }

    @Benchmark
    public void intVectorSliceWithConstantIndex1() {
        for (int i = 0; i < ispecies.loopBound(idst.length); i += ispecies.length()) {
            IntVector.fromArray(ispecies, isrc1, i)
                     .slice(I_SLICE_IDX1, IntVector.fromArray(ispecies, isrc2, i))
                     .intoArray(idst, i);
        }
    }

    @Benchmark
    public void intVectorSliceWithConstantIndex2() {
        for (int i = 0; i < ispecies.loopBound(idst.length); i += ispecies.length()) {
            IntVector.fromArray(ispecies, isrc1, i)
                     .slice(I_SLICE_IDX2, IntVector.fromArray(ispecies, isrc2, i))
                     .intoArray(idst, i);
        }
    }

    @Benchmark
    public void intVectorSliceWithVariableIndex() {
        for (int i = 0; i < ispecies.loopBound(idst.length); i += ispecies.length()) {
            IntVector.fromArray(ispecies, isrc1, i)
                     .slice(i & (ispecies.length() - 1), IntVector.fromArray(ispecies, isrc2, i))
                     .intoArray(idst, i);
        }
    }

    @Benchmark
    public void longVectorSliceWithConstantIndex1() {
        for (int i = 0; i < lspecies.loopBound(ldst.length); i += lspecies.length()) {
            LongVector.fromArray(lspecies, lsrc1, i)
                      .slice(L_SLICE_IDX1, LongVector.fromArray(lspecies, lsrc2, i))
                      .intoArray(ldst, i);
        }
    }

    @Benchmark
    public void longVectorSliceWithConstantIndex2() {
        for (int i = 0; i < lspecies.loopBound(ldst.length); i += lspecies.length()) {
            LongVector.fromArray(lspecies, lsrc1, i)
                      .slice(L_SLICE_IDX2, LongVector.fromArray(lspecies, lsrc2, i))
                      .intoArray(ldst, i);
        }
    }

    @Benchmark
    public void longVectorSliceWithVariableIndex() {
        for (int i = 0; i < lspecies.loopBound(ldst.length); i += lspecies.length()) {
            LongVector.fromArray(lspecies, lsrc1, i)
                      .slice(i & (lspecies.length() - 1), LongVector.fromArray(lspecies, lsrc2, i))
                      .intoArray(ldst, i);
        }
    }
}