Patch summary: scalar half-precision (FP16) support for AArch64.
 * aarch64.ad: gates the *HF match rules on FEAT_FP16 (plus UseFMA for FmaHF), adds the
   immH operand and the instructs for FP16 constants, add/sub/mul/div, min/max, sqrt,
   fused multiply-add and the HF <-> short reinterpret rules.
 * assembler_aarch64.hpp: adds the h-suffixed scalar floating-point instructions.
 * matcher_aarch64.hpp, vm_version_*: add the FPHP and ASIMDHP CPU features and the
   is_feat_fp16_supported() helper.
 * AArch64.java, aarch64-asmtest.py, asmtest.out.h: JVMCI feature names and assembler tests.

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 76e3c92ddc261..152dcb6b13c68 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2014, 2024, Red Hat, Inc. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
@@ -2296,6 +2296,26 @@ bool Matcher::match_rule_supported(int opcode) {
         return false;
       }
       break;
+    case Op_FmaHF:
+      // FmaHF is matched only when the UseFMA flag is set and FEAT_FP16 is supported
+      if (!UseFMA || !is_feat_fp16_supported()) {
+        return false;
+      }
+      break;
+    case Op_AddHF:
+    case Op_SubHF:
+    case Op_MulHF:
+    case Op_DivHF:
+    case Op_MinHF:
+    case Op_MaxHF:
+    case Op_SqrtHF:
+      // Half-precision floating point scalar operations require FEAT_FP16
+      // to be available. FEAT_FP16 is enabled if both "fphp" and "asimdhp"
+      // features are supported.
+      if (!is_feat_fp16_supported()) {
+        return false;
+      }
+      break;
   }
 
   return true; // Per default match rules are supported.
@@ -4599,6 +4619,15 @@ operand immF0()
   interface(CONST_INTER);
 %}
 
+// Half Float (FP16) Immediate; matches any ConH constant
+operand immH()
+%{
+  match(ConH);
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 //
 operand immFPacked()
 %{
@@ -6942,6 +6971,21 @@ instruct loadConD(vRegD dst, immD con) %{
   ins_pipe(fp_load_constant_d);
 %}
 
+// Load Half Float Constant
+// The "ldr" instruction loads a 32-bit word from the constant pool into a
+// 32-bit register but only the bottom half will be populated and the top
+// 16 bits are zero.
+instruct loadConH(vRegF dst, immH con) %{
+  match(Set dst con);
+  format %{
+    "ldrs $dst, [$constantaddress]\t# load from constant table: half float=$con\n\t"
+  %}
+  ins_encode %{
+    __ ldrs(as_FloatRegister($dst$$reg), $constantaddress($con));
+  %}
+  ins_pipe(fp_load_constant_s);
+%}
+
 // Store Instructions
 
 // Store Byte
@@ -13606,6 +13650,17 @@ instruct bits_reverse_L(iRegLNoSp dst, iRegL src)
 // ============================================================================
 // Floating Point Arithmetic Instructions
 
+instruct addHF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
+  match(Set dst (AddHF src1 src2)); // half-precision add, encoded with faddh
+  format %{ "faddh $dst, $src1, $src2" %}
+  ins_encode %{
+    __ faddh($dst$$FloatRegister,
+             $src1$$FloatRegister,
+             $src2$$FloatRegister);
+  %}
+  ins_pipe(fp_dop_reg_reg_s);
+%}
+
 instruct addF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
   match(Set dst (AddF src1 src2));
 
@@ -13636,6 +13691,17 @@ instruct addD_reg_reg(vRegD dst, vRegD src1, vRegD src2) %{
   ins_pipe(fp_dop_reg_reg_d);
 %}
 
+instruct subHF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
+  match(Set dst (SubHF src1 src2)); // half-precision subtract, encoded with fsubh
+  format %{ "fsubh $dst, $src1, $src2" %}
+  ins_encode %{
+    __ fsubh($dst$$FloatRegister,
+             $src1$$FloatRegister,
+             $src2$$FloatRegister);
+  %}
+  ins_pipe(fp_dop_reg_reg_s);
+%}
+
 instruct subF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
   match(Set dst (SubF src1 src2));
 
@@ -13666,6 +13732,17 @@ instruct subD_reg_reg(vRegD dst, vRegD src1, vRegD src2) %{
   ins_pipe(fp_dop_reg_reg_d);
 %}
 
+instruct mulHF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
+  match(Set dst (MulHF src1 src2)); // half-precision multiply, encoded with fmulh
+  format %{ "fmulh $dst, $src1, $src2" %}
+  ins_encode %{
+    __ fmulh($dst$$FloatRegister,
+             $src1$$FloatRegister,
+             $src2$$FloatRegister);
+  %}
+  ins_pipe(fp_dop_reg_reg_s);
+%}
+
 instruct mulF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
   match(Set dst (MulF src1 src2));
 
@@ -13696,6 +13773,20 @@ instruct mulD_reg_reg(vRegD dst, vRegD src1, vRegD src2) %{
   ins_pipe(fp_dop_reg_reg_d);
 %}
 
+// src1 * src2 + src3 (half-precision float)
+instruct maddHF_reg_reg(vRegF dst, vRegF src1, vRegF src2, vRegF src3) %{
+  match(Set dst (FmaHF src3 (Binary src1 src2)));
+  format %{ "fmaddh $dst, $src1, $src2, $src3" %}
+  ins_encode %{
+    assert(UseFMA, "Needs FMA instructions support.");
+    __ fmaddh($dst$$FloatRegister,
+              $src1$$FloatRegister,
+              $src2$$FloatRegister,
+              $src3$$FloatRegister);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
 // src1 * src2 + src3
 instruct maddF_reg_reg(vRegF dst, vRegF src1, vRegF src2, vRegF src3) %{
   match(Set dst (FmaF src3 (Binary src1 src2)));
@@ -13837,6 +13928,29 @@ instruct mnsubD_reg_reg(vRegD dst, vRegD src1, vRegD src2, vRegD src3, immD0 zer
   ins_pipe(pipe_class_default);
 %}
 
+// Math.max(HH)H (half-precision float)
+instruct maxHF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
+  match(Set dst (MaxHF src1 src2));
+  format %{ "fmaxh $dst, $src1, $src2" %}
+  ins_encode %{
+    __ fmaxh($dst$$FloatRegister,
+             $src1$$FloatRegister,
+             $src2$$FloatRegister);
+  %}
+  ins_pipe(fp_dop_reg_reg_s);
+%}
+
+// Math.min(HH)H (half-precision float)
+instruct minHF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
+  match(Set dst (MinHF src1 src2));
+  format %{ "fminh $dst, $src1, $src2" %}
+  ins_encode %{
+    __ fminh($dst$$FloatRegister,
+             $src1$$FloatRegister,
+             $src2$$FloatRegister);
+  %}
+  ins_pipe(fp_dop_reg_reg_s);
+%}
 
 // Math.max(FF)F
 instruct maxF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
@@ -13894,6 +14008,16 @@ instruct minD_reg_reg(vRegD dst, vRegD src1, vRegD src2) %{
   ins_pipe(fp_dop_reg_reg_d);
 %}
 
+instruct divHF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
+  match(Set dst (DivHF src1 src2)); // half-precision divide, encoded with fdivh
+  format %{ "fdivh $dst, $src1, $src2" %}
+  ins_encode %{
+    __ fdivh($dst$$FloatRegister,
+             $src1$$FloatRegister,
+             $src2$$FloatRegister);
+  %}
+  ins_pipe(fp_div_s);
+%}
 
 instruct divF_reg_reg(vRegF dst, vRegF src1, vRegF src2) %{
   match(Set dst (DivF src1 src2));
@@ -14067,6 +14191,16 @@ instruct sqrtF_reg(vRegF dst, vRegF src) %{
   ins_pipe(fp_div_d);
 %}
 
+instruct sqrtHF_reg(vRegF dst, vRegF src) %{
+  match(Set dst (SqrtHF src)); // half-precision square root, encoded with fsqrth
+  format %{ "fsqrth $dst, $src" %}
+  ins_encode %{
+    __ fsqrth($dst$$FloatRegister,
+              $src$$FloatRegister);
+  %}
+  ins_pipe(fp_div_s);
+%}
+
 // Math.rint, floor, ceil
 instruct roundD_reg(vRegD dst, vRegD src, immI rmode) %{
   match(Set dst (RoundDoubleMode src rmode));
@@ -17116,6 +17250,64 @@ instruct expandBitsL_memcon(iRegINoSp dst, memory8 mem, immL mask,
   ins_pipe(pipe_slow);
 %}
 
+//----------------------------- Reinterpret ----------------------------------
+// Reinterpret a half-precision float value in a floating point register to a general purpose register
+instruct reinterpretHF2S(iRegINoSp dst, vRegF src) %{
+  match(Set dst (ReinterpretHF2S src));
+  format %{ "reinterpretHF2S $dst, $src" %}
+  ins_encode %{
+    __ smov($dst$$Register, $src$$FloatRegister, __ H, 0);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Reinterpret a half-precision float value in a general purpose register to a floating point register
+instruct reinterpretS2HF(vRegF dst, iRegINoSp src) %{
+  match(Set dst (ReinterpretS2HF src));
+  format %{ "reinterpretS2HF $dst, $src" %}
+  ins_encode %{
+    __ mov($dst$$FloatRegister, __ H, 0, $src$$Register);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Without this optimization, ReinterpretS2HF (ConvF2HF src) would result in the following
+// instructions (the first two are for ConvF2HF and the last instruction is for ReinterpretS2HF) -
+// fcvt $tmp1_fpr, $src_fpr    // Convert float to half-precision float
+// mov  $tmp2_gpr, $tmp1_fpr   // Move half-precision float in FPR to a GPR
+// mov  $dst_fpr,  $tmp2_gpr   // Move the result from a GPR to an FPR
+// The move from FPR to GPR in ConvF2HF and the move from GPR to FPR in ReinterpretS2HF
+// can be omitted in this pattern, resulting in -
+// fcvt $dst, $src  // Convert float to half-precision float
+instruct convF2HFAndS2HF(vRegF dst, vRegF src)
+%{
+  match(Set dst (ReinterpretS2HF (ConvF2HF src)));
+  format %{ "convF2HFAndS2HF $dst, $src" %}
+  ins_encode %{
+    __ fcvtsh($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+// Without this optimization, ConvHF2F (ReinterpretHF2S src) would result in the following
+// instructions (the first one is for ReinterpretHF2S and the last two are for ConvHF2F) -
+// mov  $tmp1_gpr, $src_fpr   // Move the half-precision float from an FPR to a GPR
+// mov  $tmp2_fpr, $tmp1_gpr  // Move the same value from GPR to an FPR
+// fcvt $dst_fpr,  $tmp2_fpr  // Convert the half-precision float to 32-bit float
+// The move from FPR to GPR in ReinterpretHF2S and the move from GPR to FPR in ConvHF2F
+// can be omitted as the input (src) is already in an FPR required for the fcvths instruction
+// resulting in -
+// fcvt $dst, $src  // Convert half-precision float to a 32-bit float
+instruct convHF2SAndHF2F(vRegF dst, vRegF src)
+%{
+  match(Set dst (ConvHF2F (ReinterpretHF2S src)));
+  format %{ "convHF2SAndHF2F $dst, $src" %}
+  ins_encode %{
+    __ fcvths($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // ============================================================================
 // This name is KNOWN by the ADLC and cannot be changed.
// The ADLC forces a 'TypeRawPtr::BOTTOM' output type
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index 3db7d30884429..5c02e30963eaa 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2032,6 +2032,8 @@ void mvnw(Register Rd, Register Rm,
   INSN(fsqrtd, 0b01, 0b000011);
   INSN(fcvtd,  0b01, 0b000100); // Double-precision to single-precision
 
+  INSN(fsqrth, 0b11, 0b000011); // Half-precision sqrt (type field = 0b11)
+
 private:
   void _fcvt_narrow_extend(FloatRegister Vd, SIMD_Arrangement Ta,
                            FloatRegister Vn, SIMD_Arrangement Tb, bool do_extend) {
@@ -2059,37 +2061,68 @@ void mvnw(Register Rd, Register Rm,
 #undef INSN
 
   // Floating-point data-processing (2 source)
-  void data_processing(unsigned op31, unsigned type, unsigned opcode,
+  void data_processing(unsigned op31, unsigned type, unsigned opcode, unsigned op21,
                        FloatRegister Vd, FloatRegister Vn, FloatRegister Vm) {
     starti;
     f(op31, 31, 29);
     f(0b11110, 28, 24);
-    f(type, 23, 22), f(1, 21), f(opcode, 15, 10);
+    f(type, 23, 22), f(op21, 21), f(opcode, 15, 10); // op21 supplies bit 21 (previously hard-coded to 1)
     rf(Vm, 16), rf(Vn, 5), rf(Vd, 0);
   }
 
-#define INSN(NAME, op31, type, opcode)                  \
+#define INSN(NAME, op31, type, opcode, op21)                            \
   void NAME(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm) {     \
-    data_processing(op31, type, opcode, Vd, Vn, Vm);    \
-  }
-
-  INSN(fabds,  0b011, 0b10, 0b110101);
-  INSN(fmuls,  0b000, 0b00, 0b000010);
-  INSN(fdivs,  0b000, 0b00, 0b000110);
-  INSN(fadds,  0b000, 0b00, 0b001010);
-  INSN(fsubs,  0b000, 0b00, 0b001110);
-  INSN(fmaxs,  0b000, 0b00, 0b010010);
-  INSN(fmins,  0b000, 0b00, 0b010110);
-  INSN(fnmuls, 0b000, 0b00, 0b100010);
-
-  INSN(fabdd,  0b011, 0b11, 0b110101);
-  INSN(fmuld,  0b000, 0b01, 0b000010);
-  INSN(fdivd,  0b000, 0b01, 0b000110);
-  INSN(faddd,  0b000, 0b01, 0b001010);
-  INSN(fsubd,  0b000, 0b01, 0b001110);
-  INSN(fmaxd,  0b000, 0b01, 0b010010);
-  INSN(fmind,  0b000, 0b01, 0b010110);
-  INSN(fnmuld, 0b000, 0b01, 0b100010);
+    data_processing(op31, type, opcode, op21, Vd, Vn, Vm);              \
+  }
+
+  INSN(fmuls,  0b000, 0b00, 0b000010, 0b1);
+  INSN(fdivs,  0b000, 0b00, 0b000110, 0b1);
+  INSN(fadds,  0b000, 0b00, 0b001010, 0b1);
+  INSN(fsubs,  0b000, 0b00, 0b001110, 0b1);
+  INSN(fmaxs,  0b000, 0b00, 0b010010, 0b1);
+  INSN(fmins,  0b000, 0b00, 0b010110, 0b1);
+  INSN(fnmuls, 0b000, 0b00, 0b100010, 0b1);
+
+  INSN(fmuld,  0b000, 0b01, 0b000010, 0b1);
+  INSN(fdivd,  0b000, 0b01, 0b000110, 0b1);
+  INSN(faddd,  0b000, 0b01, 0b001010, 0b1);
+  INSN(fsubd,  0b000, 0b01, 0b001110, 0b1);
+  INSN(fmaxd,  0b000, 0b01, 0b010010, 0b1);
+  INSN(fmind,  0b000, 0b01, 0b010110, 0b1);
+  INSN(fnmuld, 0b000, 0b01, 0b100010, 0b1);
+
+  // Half-precision floating-point instructions (type field = 0b11)
+  INSN(fmulh,  0b000, 0b11, 0b000010, 0b1);
+  INSN(fdivh,  0b000, 0b11, 0b000110, 0b1);
+  INSN(faddh,  0b000, 0b11, 0b001010, 0b1);
+  INSN(fsubh,  0b000, 0b11, 0b001110, 0b1);
+  INSN(fmaxh,  0b000, 0b11, 0b010010, 0b1);
+  INSN(fminh,  0b000, 0b11, 0b010110, 0b1);
+  INSN(fnmulh, 0b000, 0b11, 0b100010, 0b1);
+#undef INSN
+
+// Advanced SIMD scalar three same (fabds/fabdd moved here from the 2-source group above)
+#define INSN(NAME, U, size, opcode)                                                     \
+  void NAME(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm) {                     \
+    starti;                                                                             \
+    f(0b01, 31, 30), f(U, 29), f(0b11110, 28, 24), f(size, 23, 22), f(1, 21);           \
+    rf(Vm, 16), f(opcode, 15, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0);                      \
+  }
+
+  INSN(fabds, 0b1, 0b10, 0b11010); // Floating-point Absolute Difference (single-precision)
+  INSN(fabdd, 0b1, 0b11, 0b11010); // Floating-point Absolute Difference (double-precision)
+
+#undef INSN
+
+// Advanced SIMD scalar three same FP16
+#define INSN(NAME, U, a, opcode)                                                        \
+  void NAME(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm) {                     \
+    starti;                                                                             \
+    f(0b01, 31, 30), f(U, 29), f(0b11110, 28, 24), f(a, 23), f(0b10, 22, 21);           \
+    rf(Vm, 16), f(0b00, 15, 14), f(opcode, 13, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0);     \
+  }
+
+  INSN(fabdh, 0b1, 0b1, 0b010); // Floating-point Absolute Difference (half-precision float)
 
 #undef INSN
 
@@ -2120,6 +2153,7 @@ void mvnw(Register Rd, Register Rm,
   INSN(fnmaddd, 0b000, 0b01, 1, 0);
   INSN(fnmsub,  0b000, 0b01, 1, 1);
 
+  INSN(fmaddh,  0b000, 0b11, 0, 0); // half-precision fused multiply-add (scalar)
 #undef INSN
 
   // Floating-point conditional select
diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp
index a6cd055775870..0fbc2ef141e8b 100644
--- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -200,4 +200,8 @@
     return false;
   }
 
+  // Is FEAT_FP16 supported for this CPU? True only when both fphp and asimdhp are reported.
+  static bool is_feat_fp16_supported() {
+    return (VM_Version::supports_fphp() && VM_Version::supports_asimdhp());
+  }
 #endif // CPU_AARCH64_MATCHER_AARCH64_HPP
diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp
index 04cf9c9c2a07c..373f8da540589 100644
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@@ -125,6 +125,8 @@ enum Ampere_CPU_Model {
     decl(SHA2,          sha256,        6)     \
     decl(CRC32,         crc32,         7)     \
     decl(LSE,           lse,           8)     \
+    decl(FPHP,          fphp,          9)     \
+    decl(ASIMDHP,       asimdhp,       10)    \
     decl(DCPOP,         dcpop,         16)    \
     decl(SHA3,          sha3,          17)    \
     decl(SHA512,        sha512,        21)    \
diff --git a/src/hotspot/os_cpu/linux_aarch64/vm_version_linux_aarch64.cpp b/src/hotspot/os_cpu/linux_aarch64/vm_version_linux_aarch64.cpp
index dabc69403f3d5..9725c6cd6c073 100644
--- a/src/hotspot/os_cpu/linux_aarch64/vm_version_linux_aarch64.cpp
+++ b/src/hotspot/os_cpu/linux_aarch64/vm_version_linux_aarch64.cpp
@@ -75,6 +75,14 @@
 #define HWCAP_PACA (1 << 30)
 #endif
 
+#ifndef HWCAP_FPHP
+#define HWCAP_FPHP (1<<9) // fallback definition; static_assert'ed equal to CPU_FPHP in get_os_cpu_info()
+#endif
+
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1<<10) // fallback definition; static_assert'ed equal to CPU_ASIMDHP in get_os_cpu_info()
+#endif
+
 #ifndef HWCAP2_SVE2
 #define HWCAP2_SVE2 (1 << 1)
 #endif
@@ -119,6 +127,8 @@ void VM_Version::get_os_cpu_info() {
   static_assert(CPU_SHA512  == HWCAP_SHA512,  "Flag CPU_SHA512 must follow Linux HWCAP");
   static_assert(CPU_SVE     == HWCAP_SVE,     "Flag CPU_SVE must follow Linux HWCAP");
   static_assert(CPU_PACA    == HWCAP_PACA,    "Flag CPU_PACA must follow Linux HWCAP");
+  static_assert(CPU_FPHP    == HWCAP_FPHP,    "Flag CPU_FPHP must follow Linux HWCAP");
+  static_assert(CPU_ASIMDHP == HWCAP_ASIMDHP, "Flag CPU_ASIMDHP must follow Linux HWCAP");
   _features = auxv & (
       HWCAP_FP      |
       HWCAP_ASIMD   |
@@ -133,7 +143,9 @@ void VM_Version::get_os_cpu_info() {
       HWCAP_SHA3    |
       HWCAP_SHA512  |
       HWCAP_SVE     |
-      HWCAP_PACA);
+      HWCAP_PACA    |
+      HWCAP_FPHP    |
+      HWCAP_ASIMDHP);
 
   if (auxv2 & HWCAP2_SVE2) _features |= CPU_SVE2;
   if (auxv2 & HWCAP2_SVEBITPERM) _features |= CPU_SVEBITPERM;
diff --git a/src/jdk.internal.vm.ci/share/classes/jdk/vm/ci/aarch64/AArch64.java b/src/jdk.internal.vm.ci/share/classes/jdk/vm/ci/aarch64/AArch64.java
index 2a8959e5d7fe9..dbd83b314c9eb 100644
--- a/src/jdk.internal.vm.ci/share/classes/jdk/vm/ci/aarch64/AArch64.java
+++ b/src/jdk.internal.vm.ci/share/classes/jdk/vm/ci/aarch64/AArch64.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -183,6 +183,8 @@ public enum CPUFeature implements CPUFeatureName {
         SVEBITPERM,
         SVE2,
         A53MAC,
+        FPHP,
+        ASIMDHP,
     }
 
     private final EnumSet features;
diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
index 92868e783dcfe..866fb7f3e5884 100644
--- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py
+++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@@ -14,10 +14,9 @@
 
 immediates16 \
     = [0x1, 0x38, 0x7e, 0xff, 0x1fc, 0x1ff, 0x3f0,
-       0x7e0, 0xfc0, 0x1f80, 0x3ff0, 0x7e00, 0x7e00,
-       0x8000, 0x81ff, 0xc1ff, 0xc003, 0xc7ff, 0xdfff,
-       0xe03f, 0xe10f, 0xe1ff, 0xf801, 0xfc00, 0xfc07,
-       0xff03, 0xfffe]
+       0x7e0, 0xfc0, 0x1f80, 0x3ff0, 0x7e00, 0x8000,
+       0x81ff, 0xc1ff, 0xc003, 0xc7ff, 0xdfff, 0xe03f,
+       0xe1ff, 0xf801, 0xfc00, 0xfc07, 0xff03, 0xfffe]
 
 immediates32 \
     = [0x1, 0x3f, 0x1f0, 0x7e0,
@@ -1065,7 +1064,7 @@ class FloatInstruction(Instruction):
     def aname(self):
         if (self._name in ["fcvtsh", "fcvths"]):
             return self._name[:len(self._name)-2]
-        elif (self._name.endswith("s") | self._name.endswith("d")):
+        elif (self._name.endswith("h") | self._name.endswith("s") | self._name.endswith("d")):  # one-letter precision suffix: h, s or d
             return self._name[:len(self._name)-1]
         else:
             return self._name
@@ -1684,19 +1683,24 @@ def generate(kind, names):
           ["maddw", "msubw", "madd", "msub", "smaddl", "smsubl", "umaddl", "umsubl"])
 
 generate(ThreeRegFloatOp,
-         [["fabds", "sss"], ["fmuls", "sss"], ["fdivs", "sss"], ["fadds", "sss"], ["fsubs", "sss"],
+         [["fabdh", "hhh"], ["fmulh", "hhh"], ["fdivh", "hhh"], ["faddh", "hhh"], ["fsubh", "hhh"],
+          ["fmaxh", "hhh"], ["fminh", "hhh"], ["fnmulh", "hhh"],
+          ["fabds", "sss"], ["fmuls", "sss"], ["fdivs", "sss"], ["fadds", "sss"], ["fsubs", "sss"],
+          ["fmaxs", "sss"], ["fmins", "sss"], ["fnmuls", "sss"],
["fabdd", "ddd"], ["fmuld", "ddd"], ["fdivd", "ddd"], ["faddd", "ddd"], ["fsubd", "ddd"], + ["fmaxd", "ddd"], ["fmind", "ddd"], ["fnmuld", "ddd"] ]) generate(FourRegFloatOp, - [["fmadds", "ssss"], ["fmsubs", "ssss"], ["fnmadds", "ssss"], ["fnmadds", "ssss"], - ["fmaddd", "dddd"], ["fmsubd", "dddd"], ["fnmaddd", "dddd"], ["fnmaddd", "dddd"],]) + [["fmaddh", "hhhh"], ["fmadds", "ssss"], ["fmsubs", "ssss"], ["fnmadds", "ssss"], + ["fnmadds", "ssss"], ["fmaddd", "dddd"], ["fmsubd", "dddd"], ["fnmaddd", "dddd"], + ["fnmaddd", "dddd"],]) generate(TwoRegFloatOp, [["fmovs", "ss"], ["fabss", "ss"], ["fnegs", "ss"], ["fsqrts", "ss"], ["fcvts", "ds"], ["fcvtsh", "hs"], ["fcvths", "sh"], ["fmovd", "dd"], ["fabsd", "dd"], ["fnegd", "dd"], ["fsqrtd", "dd"], - ["fcvtd", "sd"], + ["fcvtd", "sd"], ["fsqrth", "hh"] ]) generate(FloatConvertOp, [["fcvtzsw", "fcvtzs", "ws"], ["fcvtzs", "fcvtzs", "xs"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index 0c2011592b6f4..c2e8046213bae 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -484,404 +484,420 @@ __ umsubl(r13, r10, r7, r5); // umsubl x13, w10, w7, x5 // ThreeRegFloatOp - __ fabds(v30, v15, v3); // fabd s30, s15, s3 - __ fmuls(v12, v12, v16); // fmul s12, s12, s16 - __ fdivs(v31, v31, v18); // fdiv s31, s31, s18 - __ fadds(v19, v21, v16); // fadd s19, s21, s16 - __ fsubs(v15, v10, v21); // fsub s15, s10, s21 - __ fabdd(v2, v10, v28); // fabd d2, d10, d28 - __ fmuld(v7, v30, v31); // fmul d7, d30, d31 - __ fdivd(v18, v1, v2); // fdiv d18, d1, d2 - __ faddd(v6, v10, v3); // fadd d6, d10, d3 - __ fsubd(v25, v11, v7); // fsub d25, d11, d7 + __ fabdh(v30, v15, v3); // fabd h30, h15, h3 + __ fmulh(v12, v12, v16); // fmul h12, h12, h16 + __ fdivh(v31, v31, v18); // fdiv h31, h31, h18 + __ faddh(v19, v21, v16); // fadd h19, h21, h16 + __ fsubh(v15, v10, v21); // fsub h15, h10, h21 + __ fmaxh(v2, v10, v28); // fmax h2, h10, h28 + __ 
fminh(v7, v30, v31); // fmin h7, h30, h31 + __ fnmulh(v18, v1, v2); // fnmul h18, h1, h2 + __ fabds(v6, v10, v3); // fabd s6, s10, s3 + __ fmuls(v25, v11, v7); // fmul s25, s11, s7 + __ fdivs(v1, v12, v0); // fdiv s1, s12, s0 + __ fadds(v3, v19, v29); // fadd s3, s19, s29 + __ fsubs(v6, v23, v6); // fsub s6, s23, s6 + __ fmaxs(v0, v28, v27); // fmax s0, s28, s27 + __ fmins(v2, v5, v7); // fmin s2, s5, s7 + __ fnmuls(v29, v12, v25); // fnmul s29, s12, s25 + __ fabdd(v13, v12, v24); // fabd d13, d12, d24 + __ fmuld(v19, v8, v18); // fmul d19, d8, d18 + __ fdivd(v22, v26, v21); // fdiv d22, d26, d21 + __ faddd(v20, v19, v2); // fadd d20, d19, d2 + __ fsubd(v30, v22, v8); // fsub d30, d22, d8 + __ fmaxd(v22, v19, v21); // fmax d22, d19, d21 + __ fmind(v12, v18, v21); // fmin d12, d18, d21 + __ fnmuld(v6, v16, v3); // fnmul d6, d16, d3 // FourRegFloatOp - __ fmadds(v1, v12, v0, v3); // fmadd s1, s12, s0, s3 - __ fmsubs(v19, v29, v6, v23); // fmsub s19, s29, s6, s23 - __ fnmadds(v6, v0, v28, v27); // fnmadd s6, s0, s28, s27 - __ fnmadds(v2, v5, v7, v29); // fnmadd s2, s5, s7, s29 - __ fmaddd(v12, v25, v13, v12); // fmadd d12, d25, d13, d12 - __ fmsubd(v24, v19, v8, v18); // fmsub d24, d19, d8, d18 - __ fnmaddd(v22, v26, v21, v20); // fnmadd d22, d26, d21, d20 - __ fnmaddd(v19, v2, v30, v22); // fnmadd d19, d2, d30, d22 + __ fmaddh(v3, v29, v3, v28); // fmadd h3, h29, h3, h28 + __ fmadds(v15, v14, v10, v13); // fmadd s15, s14, s10, s13 + __ fmsubs(v12, v18, v10, v26); // fmsub s12, s18, s10, s26 + __ fnmadds(v7, v7, v15, v29); // fnmadd s7, s7, s15, s29 + __ fnmadds(v0, v23, v0, v12); // fnmadd s0, s23, s0, s12 + __ fmaddd(v24, v14, v13, v8); // fmadd d24, d14, d13, d8 + __ fmsubd(v15, v7, v9, v20); // fmsub d15, d7, d9, d20 + __ fnmaddd(v19, v29, v31, v16); // fnmadd d19, d29, d31, d16 + __ fnmaddd(v2, v9, v16, v21); // fnmadd d2, d9, d16, d21 // TwoRegFloatOp - __ fmovs(v8, v22); // fmov s8, s22 - __ fabss(v19, v21); // fabs s19, s21 - __ fnegs(v12, v18); // fneg s12, 
s18 - __ fsqrts(v21, v6); // fsqrt s21, s6 - __ fcvts(v16, v3); // fcvt d16, s3 - __ fcvtsh(v3, v29); // fcvt h3, s29 - __ fcvths(v3, v28); // fcvt s3, h28 - __ fmovd(v15, v14); // fmov d15, d14 - __ fabsd(v10, v13); // fabs d10, d13 - __ fnegd(v12, v18); // fneg d12, d18 - __ fsqrtd(v10, v26); // fsqrt d10, d26 - __ fcvtd(v7, v7); // fcvt s7, d7 + __ fmovs(v30, v4); // fmov s30, s4 + __ fabss(v1, v27); // fabs s1, s27 + __ fnegs(v25, v24); // fneg s25, s24 + __ fsqrts(v14, v21); // fsqrt s14, s21 + __ fcvts(v13, v6); // fcvt d13, s6 + __ fcvtsh(v12, v25); // fcvt h12, s25 + __ fcvths(v25, v30); // fcvt s25, h30 + __ fmovd(v28, v21); // fmov d28, d21 + __ fabsd(v16, v23); // fabs d16, d23 + __ fnegd(v5, v29); // fneg d5, d29 + __ fsqrtd(v22, v19); // fsqrt d22, d19 + __ fcvtd(v13, v20); // fcvt s13, d20 + __ fsqrth(v19, v28); // fsqrt h19, h28 // FloatConvertOp - __ fcvtzsw(r14, v29); // fcvtzs w14, s29 - __ fcvtzs(r0, v23); // fcvtzs x0, s23 - __ fcvtzdw(r0, v12); // fcvtzs w0, d12 - __ fcvtzd(r23, v14); // fcvtzs x23, d14 - __ scvtfws(v13, r7); // scvtf s13, w7 - __ scvtfs(v15, r7); // scvtf s15, x7 - __ scvtfwd(v9, r20); // scvtf d9, w20 - __ scvtfd(v19, r28); // scvtf d19, x28 - __ fcvtassw(r30, v16); // fcvtas w30, s16 - __ fcvtasd(r2, v9); // fcvtas x2, d9 - __ fcvtmssw(r16, v21); // fcvtms w16, s21 - __ fcvtmsd(r29, v4); // fcvtms x29, d4 - __ fmovs(r1, v27); // fmov w1, s27 - __ fmovd(r24, v24); // fmov x24, d24 - __ fmovs(v14, r21); // fmov s14, w21 - __ fmovd(v13, r5); // fmov d13, x5 + __ fcvtzsw(r17, v6); // fcvtzs w17, s6 + __ fcvtzs(r13, v7); // fcvtzs x13, s7 + __ fcvtzdw(r28, v26); // fcvtzs w28, d26 + __ fcvtzd(r17, v6); // fcvtzs x17, d6 + __ scvtfws(v1, r4); // scvtf s1, w4 + __ scvtfs(v14, r20); // scvtf s14, x20 + __ scvtfwd(v7, r21); // scvtf d7, w21 + __ scvtfd(v27, r23); // scvtf d27, x23 + __ fcvtassw(r13, v20); // fcvtas w13, s20 + __ fcvtasd(r30, v28); // fcvtas x30, d28 + __ fcvtmssw(r10, v21); // fcvtms w10, s21 + __ fcvtmsd(r5, v17); 
// fcvtms x5, d17 + __ fmovs(r11, v14); // fmov w11, s14 + __ fmovd(r13, v21); // fmov x13, d21 + __ fmovs(v27, r14); // fmov s27, w14 + __ fmovd(v4, r23); // fmov d4, x23 // TwoRegFloatOp - __ fcmps(v12, v25); // fcmp s12, s25 - __ fcmpd(v25, v30); // fcmp d25, d30 - __ fcmps(v28, 0.0); // fcmp s28, #0.0 - __ fcmpd(v21, 0.0); // fcmp d21, #0.0 + __ fcmps(v24, v30); // fcmp s24, s30 + __ fcmpd(v12, v14); // fcmp d12, d14 + __ fcmps(v17, 0.0); // fcmp s17, #0.0 + __ fcmpd(v28, 0.0); // fcmp d28, #0.0 // LoadStorePairOp - __ stpw(r22, r5, Address(r28, -48)); // stp w22, w5, [x28, #-48] - __ ldpw(r19, r27, Address(r19, 16)); // ldp w19, w27, [x19, #16] - __ ldpsw(r28, r26, Address(r7, -32)); // ldpsw x28, x26, [x7, #-32] - __ stp(r6, r1, Address(r4, -48)); // stp x6, x1, [x4, #-48] - __ ldp(r26, r23, Address(r21, -80)); // ldp x26, x23, [x21, #-80] + __ stpw(r0, r6, Address(r26, 16)); // stp w0, w6, [x26, #16] + __ ldpw(r0, r30, Address(r6, -32)); // ldp w0, w30, [x6, #-32] + __ ldpsw(r16, r2, Address(r11, -208)); // ldpsw x16, x2, [x11, #-208] + __ stp(r15, r0, Address(r12, 128)); // stp x15, x0, [x12, #128] + __ ldp(r7, r30, Address(r23, 32)); // ldp x7, x30, [x23, #32] // LoadStorePairOp - __ stpw(r20, r30, Address(__ pre(r9, -96))); // stp w20, w30, [x9, #-96]! - __ ldpw(r13, r20, Address(__ pre(r26, 16))); // ldp w13, w20, [x26, #16]! - __ ldpsw(r29, r11, Address(__ pre(r13, -80))); // ldpsw x29, x11, [x13, #-80]! - __ stp(r27, r21, Address(__ pre(r5, -48))); // stp x27, x21, [x5, #-48]! - __ ldp(r6, r0, Address(__ pre(r30, 80))); // ldp x6, x0, [x30, #80]! + __ stpw(r26, r15, Address(__ pre(r7, -256))); // stp w26, w15, [x7, #-256]! + __ ldpw(r11, r15, Address(__ pre(r10, -32))); // ldp w11, w15, [x10, #-32]! + __ ldpsw(r19, r16, Address(__ pre(r1, 64))); // ldpsw x19, x16, [x1, #64]! + __ stp(r14, r9, Address(__ pre(r0, 128))); // stp x14, x9, [x0, #128]! + __ ldp(r27, r3, Address(__ pre(r12, -96))); // ldp x27, x3, [x12, #-96]! 
// LoadStorePairOp - __ stpw(r19, r15, Address(__ post(r16, -208))); // stp w19, w15, [x16], #-208 - __ ldpw(r12, r23, Address(__ post(r9, -240))); // ldp w12, w23, [x9], #-240 - __ ldpsw(r0, r26, Address(__ post(r15, 32))); // ldpsw x0, x26, [x15], #32 - __ stp(r8, r17, Address(__ post(r26, -208))); // stp x8, x17, [x26], #-208 - __ ldp(r25, r7, Address(__ post(r2, -176))); // ldp x25, x7, [x2], #-176 + __ stpw(r8, r11, Address(__ post(r12, -256))); // stp w8, w11, [x12], #-256 + __ ldpw(r10, r16, Address(__ post(r4, 64))); // ldp w10, w16, [x4], #64 + __ ldpsw(r10, r30, Address(__ post(r19, -64))); // ldpsw x10, x30, [x19], #-64 + __ stp(r24, r2, Address(__ post(r15, -96))); // stp x24, x2, [x15], #-96 + __ ldp(r24, r10, Address(__ post(r16, 80))); // ldp x24, x10, [x16], #80 // LoadStorePairOp - __ stnpw(r19, r17, Address(r1, -208)); // stnp w19, w17, [x1, #-208] - __ ldnpw(r0, r13, Address(r22, 128)); // ldnp w0, w13, [x22, #128] - __ stnp(r29, r23, Address(r27, 0)); // stnp x29, x23, [x27, #0] - __ ldnp(r11, r10, Address(r8, -224)); // ldnp x11, x10, [x8, #-224] + __ stnpw(r30, r21, Address(r29, 16)); // stnp w30, w21, [x29, #16] + __ ldnpw(r8, r30, Address(r10, -112)); // ldnp w8, w30, [x10, #-112] + __ stnp(r30, r26, Address(r6, -128)); // stnp x30, x26, [x6, #-128] + __ ldnp(r24, r2, Address(r20, 64)); // ldnp x24, x2, [x20, #64] // LdStNEONOp - __ ld1(v0, __ T8B, Address(r11)); // ld1 {v0.8B}, [x11] - __ ld1(v16, v17, __ T16B, Address(__ post(r26, 32))); // ld1 {v16.16B, v17.16B}, [x26], 32 - __ ld1(v22, v23, v24, __ T1D, Address(__ post(r26, r17))); // ld1 {v22.1D, v23.1D, v24.1D}, [x26], x17 - __ ld1(v27, v28, v29, v30, __ T8H, Address(__ post(r29, 64))); // ld1 {v27.8H, v28.8H, v29.8H, v30.8H}, [x29], 64 - __ ld1r(v22, __ T8B, Address(r6)); // ld1r {v22.8B}, [x6] - __ ld1r(v14, __ T4S, Address(__ post(r29, 4))); // ld1r {v14.4S}, [x29], 4 - __ ld1r(v22, __ T1D, Address(__ post(r12, r16))); // ld1r {v22.1D}, [x12], x16 - __ ld2(v1, v2, __ T2D, 
Address(r0)); // ld2 {v1.2D, v2.2D}, [x0] - __ ld2(v10, v11, __ T4H, Address(__ post(r21, 16))); // ld2 {v10.4H, v11.4H}, [x21], 16 - __ ld2r(v7, v8, __ T16B, Address(r25)); // ld2r {v7.16B, v8.16B}, [x25] - __ ld2r(v9, v10, __ T2S, Address(__ post(r9, 8))); // ld2r {v9.2S, v10.2S}, [x9], 8 - __ ld2r(v9, v10, __ T2D, Address(__ post(r12, r14))); // ld2r {v9.2D, v10.2D}, [x12], x14 - __ ld3(v7, v8, v9, __ T4S, Address(__ post(r4, r17))); // ld3 {v7.4S, v8.4S, v9.4S}, [x4], x17 - __ ld3(v23, v24, v25, __ T2S, Address(r17)); // ld3 {v23.2S, v24.2S, v25.2S}, [x17] - __ ld3r(v4, v5, v6, __ T8H, Address(r22)); // ld3r {v4.8H, v5.8H, v6.8H}, [x22] - __ ld3r(v13, v14, v15, __ T4S, Address(__ post(r2, 12))); // ld3r {v13.4S, v14.4S, v15.4S}, [x2], 12 - __ ld3r(v16, v17, v18, __ T1D, Address(__ post(r10, r12))); // ld3r {v16.1D, v17.1D, v18.1D}, [x10], x12 - __ ld4(v4, v5, v6, v7, __ T8H, Address(__ post(r2, 64))); // ld4 {v4.8H, v5.8H, v6.8H, v7.8H}, [x2], 64 - __ ld4(v6, v7, v8, v9, __ T8B, Address(__ post(r20, r11))); // ld4 {v6.8B, v7.8B, v8.8B, v9.8B}, [x20], x11 - __ ld4r(v12, v13, v14, v15, __ T8B, Address(r12)); // ld4r {v12.8B, v13.8B, v14.8B, v15.8B}, [x12] - __ ld4r(v16, v17, v18, v19, __ T4H, Address(__ post(r17, 8))); // ld4r {v16.4H, v17.4H, v18.4H, v19.4H}, [x17], 8 - __ ld4r(v14, v15, v16, v17, __ T2S, Address(__ post(r25, r16))); // ld4r {v14.2S, v15.2S, v16.2S, v17.2S}, [x25], x16 + __ ld1(v31, __ T8B, Address(r25)); // ld1 {v31.8B}, [x25] + __ ld1(v5, v6, __ T16B, Address(__ post(r15, 32))); // ld1 {v5.16B, v6.16B}, [x15], 32 + __ ld1(v10, v11, v12, __ T1D, Address(__ post(r7, r13))); // ld1 {v10.1D, v11.1D, v12.1D}, [x7], x13 + __ ld1(v13, v14, v15, v16, __ T8H, Address(__ post(r16, 64))); // ld1 {v13.8H, v14.8H, v15.8H, v16.8H}, [x16], 64 + __ ld1r(v7, __ T8B, Address(r17)); // ld1r {v7.8B}, [x17] + __ ld1r(v16, __ T4S, Address(__ post(r25, 4))); // ld1r {v16.4S}, [x25], 4 + __ ld1r(v11, __ T1D, Address(__ post(r3, r7))); // ld1r {v11.1D}, [x3], x7 + __ 
ld2(v13, v14, __ T2D, Address(r7)); // ld2 {v13.2D, v14.2D}, [x7] + __ ld2(v9, v10, __ T4H, Address(__ post(r27, 16))); // ld2 {v9.4H, v10.4H}, [x27], 16 + __ ld2r(v6, v7, __ T16B, Address(r26)); // ld2r {v6.16B, v7.16B}, [x26] + __ ld2r(v23, v24, __ T2S, Address(__ post(r16, 8))); // ld2r {v23.2S, v24.2S}, [x16], 8 + __ ld2r(v6, v7, __ T2D, Address(__ post(r13, r8))); // ld2r {v6.2D, v7.2D}, [x13], x8 + __ ld3(v20, v21, v22, __ T4S, Address(__ post(r1, r26))); // ld3 {v20.4S, v21.4S, v22.4S}, [x1], x26 + __ ld3(v15, v16, v17, __ T2S, Address(r15)); // ld3 {v15.2S, v16.2S, v17.2S}, [x15] + __ ld3r(v29, v30, v31, __ T8H, Address(r22)); // ld3r {v29.8H, v30.8H, v31.8H}, [x22] + __ ld3r(v6, v7, v8, __ T4S, Address(__ post(r10, 12))); // ld3r {v6.4S, v7.4S, v8.4S}, [x10], 12 + __ ld3r(v15, v16, v17, __ T1D, Address(__ post(r6, r15))); // ld3r {v15.1D, v16.1D, v17.1D}, [x6], x15 + __ ld4(v6, v7, v8, v9, __ T8H, Address(__ post(r10, 64))); // ld4 {v6.8H, v7.8H, v8.8H, v9.8H}, [x10], 64 + __ ld4(v11, v12, v13, v14, __ T8B, Address(__ post(r3, r7))); // ld4 {v11.8B, v12.8B, v13.8B, v14.8B}, [x3], x7 + __ ld4r(v12, v13, v14, v15, __ T8B, Address(r25)); // ld4r {v12.8B, v13.8B, v14.8B, v15.8B}, [x25] + __ ld4r(v11, v12, v13, v14, __ T4H, Address(__ post(r15, 8))); // ld4r {v11.4H, v12.4H, v13.4H, v14.4H}, [x15], 8 + __ ld4r(v30, v31, v0, v1, __ T2S, Address(__ post(r6, r28))); // ld4r {v30.2S, v31.2S, v0.2S, v1.2S}, [x6], x28 // NEONReduceInstruction - __ addv(v20, __ T8B, v21); // addv b20, v21.8B - __ addv(v1, __ T16B, v2); // addv b1, v2.16B - __ addv(v23, __ T4H, v24); // addv h23, v24.4H - __ addv(v30, __ T8H, v31); // addv h30, v31.8H - __ addv(v14, __ T4S, v15); // addv s14, v15.4S - __ smaxv(v2, __ T8B, v3); // smaxv b2, v3.8B - __ smaxv(v6, __ T16B, v7); // smaxv b6, v7.16B - __ smaxv(v3, __ T4H, v4); // smaxv h3, v4.4H - __ smaxv(v8, __ T8H, v9); // smaxv h8, v9.8H - __ smaxv(v25, __ T4S, v26); // smaxv s25, v26.4S - __ fmaxv(v0, __ T4S, v1); // fmaxv s0, v1.4S - 
__ sminv(v27, __ T8B, v28); // sminv b27, v28.8B - __ uminv(v30, __ T8B, v31); // uminv b30, v31.8B - __ sminv(v5, __ T16B, v6); // sminv b5, v6.16B - __ uminv(v5, __ T16B, v6); // uminv b5, v6.16B - __ sminv(v30, __ T4H, v31); // sminv h30, v31.4H - __ uminv(v11, __ T4H, v12); // uminv h11, v12.4H - __ sminv(v25, __ T8H, v26); // sminv h25, v26.8H - __ uminv(v0, __ T8H, v1); // uminv h0, v1.8H + __ addv(v27, __ T8B, v28); // addv b27, v28.8B + __ addv(v28, __ T16B, v29); // addv b28, v29.16B + __ addv(v1, __ T4H, v2); // addv h1, v2.4H + __ addv(v28, __ T8H, v29); // addv h28, v29.8H + __ addv(v1, __ T4S, v2); // addv s1, v2.4S + __ smaxv(v20, __ T8B, v21); // smaxv b20, v21.8B + __ smaxv(v29, __ T16B, v30); // smaxv b29, v30.16B + __ smaxv(v16, __ T4H, v17); // smaxv h16, v17.4H + __ smaxv(v13, __ T8H, v14); // smaxv h13, v14.8H + __ smaxv(v10, __ T4S, v11); // smaxv s10, v11.4S + __ fmaxv(v29, __ T4S, v30); // fmaxv s29, v30.4S + __ sminv(v29, __ T8B, v30); // sminv b29, v30.8B + __ uminv(v19, __ T8B, v20); // uminv b19, v20.8B + __ sminv(v22, __ T16B, v23); // sminv b22, v23.16B + __ uminv(v10, __ T16B, v11); // uminv b10, v11.16B + __ sminv(v4, __ T4H, v5); // sminv h4, v5.4H + __ uminv(v31, __ T4H, v0); // uminv h31, v0.4H + __ sminv(v21, __ T8H, v22); // sminv h21, v22.8H + __ uminv(v8, __ T8H, v9); // uminv h8, v9.8H __ sminv(v31, __ T4S, v0); // sminv s31, v0.4S - __ uminv(v0, __ T4S, v1); // uminv s0, v1.4S - __ fminv(v19, __ T4S, v20); // fminv s19, v20.4S - __ fmaxp(v29, v30, __ S); // fmaxp s29, v30.2S - __ fmaxp(v26, v27, __ D); // fmaxp d26, v27.2D - __ fminp(v9, v10, __ S); // fminp s9, v10.2S - __ fminp(v26, v27, __ D); // fminp d26, v27.2D + __ uminv(v19, __ T4S, v20); // uminv s19, v20.4S + __ fminv(v10, __ T4S, v11); // fminv s10, v11.4S + __ fmaxp(v28, v29, __ S); // fmaxp s28, v29.2S + __ fmaxp(v2, v3, __ D); // fmaxp d2, v3.2D + __ fminp(v25, v26, __ S); // fminp s25, v26.2S + __ fminp(v5, v6, __ D); // fminp d5, v6.2D // 
NEONFloatCompareWithZero - __ fcm(Assembler::GT, v12, __ T2S, v13); // fcmgt v12.2S, v13.2S, #0.0 - __ fcm(Assembler::GT, v15, __ T4S, v16); // fcmgt v15.4S, v16.4S, #0.0 - __ fcm(Assembler::GT, v11, __ T2D, v12); // fcmgt v11.2D, v12.2D, #0.0 - __ fcm(Assembler::GE, v11, __ T2S, v12); // fcmge v11.2S, v12.2S, #0.0 - __ fcm(Assembler::GE, v18, __ T4S, v19); // fcmge v18.4S, v19.4S, #0.0 - __ fcm(Assembler::GE, v25, __ T2D, v26); // fcmge v25.2D, v26.2D, #0.0 - __ fcm(Assembler::EQ, v22, __ T2S, v23); // fcmeq v22.2S, v23.2S, #0.0 + __ fcm(Assembler::GT, v3, __ T2S, v4); // fcmgt v3.2S, v4.2S, #0.0 + __ fcm(Assembler::GT, v8, __ T4S, v9); // fcmgt v8.4S, v9.4S, #0.0 + __ fcm(Assembler::GT, v22, __ T2D, v23); // fcmgt v22.2D, v23.2D, #0.0 + __ fcm(Assembler::GE, v19, __ T2S, v20); // fcmge v19.2S, v20.2S, #0.0 + __ fcm(Assembler::GE, v13, __ T4S, v14); // fcmge v13.4S, v14.4S, #0.0 + __ fcm(Assembler::GE, v5, __ T2D, v6); // fcmge v5.2D, v6.2D, #0.0 + __ fcm(Assembler::EQ, v29, __ T2S, v30); // fcmeq v29.2S, v30.2S, #0.0 __ fcm(Assembler::EQ, v24, __ T4S, v25); // fcmeq v24.4S, v25.4S, #0.0 - __ fcm(Assembler::EQ, v0, __ T2D, v1); // fcmeq v0.2D, v1.2D, #0.0 - __ fcm(Assembler::LT, v17, __ T2S, v18); // fcmlt v17.2S, v18.2S, #0.0 - __ fcm(Assembler::LT, v11, __ T4S, v12); // fcmlt v11.4S, v12.4S, #0.0 - __ fcm(Assembler::LT, v6, __ T2D, v7); // fcmlt v6.2D, v7.2D, #0.0 - __ fcm(Assembler::LE, v29, __ T2S, v30); // fcmle v29.2S, v30.2S, #0.0 - __ fcm(Assembler::LE, v6, __ T4S, v7); // fcmle v6.4S, v7.4S, #0.0 - __ fcm(Assembler::LE, v5, __ T2D, v6); // fcmle v5.2D, v6.2D, #0.0 + __ fcm(Assembler::EQ, v21, __ T2D, v22); // fcmeq v21.2D, v22.2D, #0.0 + __ fcm(Assembler::LT, v26, __ T2S, v27); // fcmlt v26.2S, v27.2S, #0.0 + __ fcm(Assembler::LT, v24, __ T4S, v25); // fcmlt v24.4S, v25.4S, #0.0 + __ fcm(Assembler::LT, v3, __ T2D, v4); // fcmlt v3.2D, v4.2D, #0.0 + __ fcm(Assembler::LE, v24, __ T2S, v25); // fcmle v24.2S, v25.2S, #0.0 + __ fcm(Assembler::LE, v26, __ T4S, 
v27); // fcmle v26.4S, v27.4S, #0.0 + __ fcm(Assembler::LE, v23, __ T2D, v24); // fcmle v23.2D, v24.2D, #0.0 // TwoRegNEONOp - __ absr(v5, __ T8B, v6); // abs v5.8B, v6.8B + __ absr(v15, __ T8B, v16); // abs v15.8B, v16.8B __ absr(v21, __ T16B, v22); // abs v21.16B, v22.16B - __ absr(v19, __ T4H, v20); // abs v19.4H, v20.4H - __ absr(v16, __ T8H, v17); // abs v16.8H, v17.8H - __ absr(v18, __ T2S, v19); // abs v18.2S, v19.2S - __ absr(v30, __ T4S, v31); // abs v30.4S, v31.4S - __ absr(v27, __ T2D, v28); // abs v27.2D, v28.2D - __ fabs(v28, __ T2S, v29); // fabs v28.2S, v29.2S - __ fabs(v1, __ T4S, v2); // fabs v1.4S, v2.4S - __ fabs(v28, __ T2D, v29); // fabs v28.2D, v29.2D + __ absr(v3, __ T4H, v4); // abs v3.4H, v4.4H + __ absr(v24, __ T8H, v25); // abs v24.8H, v25.8H + __ absr(v8, __ T2S, v9); // abs v8.2S, v9.2S + __ absr(v25, __ T4S, v26); // abs v25.4S, v26.4S + __ absr(v20, __ T2D, v21); // abs v20.2D, v21.2D + __ fabs(v16, __ T2S, v17); // fabs v16.2S, v17.2S + __ fabs(v17, __ T4S, v18); // fabs v17.4S, v18.4S + __ fabs(v2, __ T2D, v3); // fabs v2.2D, v3.2D __ fneg(v1, __ T2S, v2); // fneg v1.2S, v2.2S - __ fneg(v20, __ T4S, v21); // fneg v20.4S, v21.4S - __ fneg(v29, __ T2D, v30); // fneg v29.2D, v30.2D - __ fsqrt(v16, __ T2S, v17); // fsqrt v16.2S, v17.2S - __ fsqrt(v13, __ T4S, v14); // fsqrt v13.4S, v14.4S - __ fsqrt(v10, __ T2D, v11); // fsqrt v10.2D, v11.2D - __ notr(v29, __ T8B, v30); // not v29.8B, v30.8B - __ notr(v29, __ T16B, v30); // not v29.16B, v30.16B + __ fneg(v0, __ T4S, v1); // fneg v0.4S, v1.4S + __ fneg(v24, __ T2D, v25); // fneg v24.2D, v25.2D + __ fsqrt(v4, __ T2S, v5); // fsqrt v4.2S, v5.2S + __ fsqrt(v3, __ T4S, v4); // fsqrt v3.4S, v4.4S + __ fsqrt(v12, __ T2D, v13); // fsqrt v12.2D, v13.2D + __ notr(v31, __ T8B, v0); // not v31.8B, v0.8B + __ notr(v28, __ T16B, v29); // not v28.16B, v29.16B // ThreeRegNEONOp - __ andr(v19, __ T8B, v20, v21); // and v19.8B, v20.8B, v21.8B - __ andr(v22, __ T16B, v23, v24); // and v22.16B, v23.16B, 
v24.16B - __ orr(v10, __ T8B, v11, v12); // orr v10.8B, v11.8B, v12.8B - __ orr(v4, __ T16B, v5, v6); // orr v4.16B, v5.16B, v6.16B - __ eor(v31, __ T8B, v0, v1); // eor v31.8B, v0.8B, v1.8B - __ eor(v21, __ T16B, v22, v23); // eor v21.16B, v22.16B, v23.16B - __ addv(v8, __ T8B, v9, v10); // add v8.8B, v9.8B, v10.8B - __ addv(v31, __ T16B, v0, v1); // add v31.16B, v0.16B, v1.16B - __ addv(v19, __ T4H, v20, v21); // add v19.4H, v20.4H, v21.4H - __ addv(v10, __ T8H, v11, v12); // add v10.8H, v11.8H, v12.8H - __ addv(v28, __ T2S, v29, v30); // add v28.2S, v29.2S, v30.2S - __ addv(v2, __ T4S, v3, v4); // add v2.4S, v3.4S, v4.4S - __ addv(v25, __ T2D, v26, v27); // add v25.2D, v26.2D, v27.2D - __ sqaddv(v5, __ T8B, v6, v7); // sqadd v5.8B, v6.8B, v7.8B - __ sqaddv(v3, __ T16B, v4, v5); // sqadd v3.16B, v4.16B, v5.16B - __ sqaddv(v8, __ T4H, v9, v10); // sqadd v8.4H, v9.4H, v10.4H - __ sqaddv(v22, __ T8H, v23, v24); // sqadd v22.8H, v23.8H, v24.8H - __ sqaddv(v19, __ T2S, v20, v21); // sqadd v19.2S, v20.2S, v21.2S - __ sqaddv(v13, __ T4S, v14, v15); // sqadd v13.4S, v14.4S, v15.4S - __ sqaddv(v5, __ T2D, v6, v7); // sqadd v5.2D, v6.2D, v7.2D - __ uqaddv(v29, __ T8B, v30, v31); // uqadd v29.8B, v30.8B, v31.8B - __ uqaddv(v24, __ T16B, v25, v26); // uqadd v24.16B, v25.16B, v26.16B - __ uqaddv(v21, __ T4H, v22, v23); // uqadd v21.4H, v22.4H, v23.4H - __ uqaddv(v26, __ T8H, v27, v28); // uqadd v26.8H, v27.8H, v28.8H - __ uqaddv(v24, __ T2S, v25, v26); // uqadd v24.2S, v25.2S, v26.2S - __ uqaddv(v3, __ T4S, v4, v5); // uqadd v3.4S, v4.4S, v5.4S - __ uqaddv(v24, __ T2D, v25, v26); // uqadd v24.2D, v25.2D, v26.2D - __ fadd(v26, __ T2S, v27, v28); // fadd v26.2S, v27.2S, v28.2S - __ fadd(v23, __ T4S, v24, v25); // fadd v23.4S, v24.4S, v25.4S - __ fadd(v15, __ T2D, v16, v17); // fadd v15.2D, v16.2D, v17.2D - __ subv(v21, __ T8B, v22, v23); // sub v21.8B, v22.8B, v23.8B - __ subv(v3, __ T16B, v4, v5); // sub v3.16B, v4.16B, v5.16B - __ subv(v24, __ T4H, v25, v26); // sub v24.4H, 
v25.4H, v26.4H - __ subv(v8, __ T8H, v9, v10); // sub v8.8H, v9.8H, v10.8H - __ subv(v25, __ T2S, v26, v27); // sub v25.2S, v26.2S, v27.2S - __ subv(v20, __ T4S, v21, v22); // sub v20.4S, v21.4S, v22.4S - __ subv(v16, __ T2D, v17, v18); // sub v16.2D, v17.2D, v18.2D - __ sqsubv(v17, __ T8B, v18, v19); // sqsub v17.8B, v18.8B, v19.8B - __ sqsubv(v2, __ T16B, v3, v4); // sqsub v2.16B, v3.16B, v4.16B - __ sqsubv(v1, __ T4H, v2, v3); // sqsub v1.4H, v2.4H, v3.4H - __ sqsubv(v0, __ T8H, v1, v2); // sqsub v0.8H, v1.8H, v2.8H - __ sqsubv(v24, __ T2S, v25, v26); // sqsub v24.2S, v25.2S, v26.2S - __ sqsubv(v4, __ T4S, v5, v6); // sqsub v4.4S, v5.4S, v6.4S - __ sqsubv(v3, __ T2D, v4, v5); // sqsub v3.2D, v4.2D, v5.2D - __ uqsubv(v12, __ T8B, v13, v14); // uqsub v12.8B, v13.8B, v14.8B - __ uqsubv(v31, __ T16B, v0, v1); // uqsub v31.16B, v0.16B, v1.16B - __ uqsubv(v28, __ T4H, v29, v30); // uqsub v28.4H, v29.4H, v30.4H - __ uqsubv(v10, __ T8H, v11, v12); // uqsub v10.8H, v11.8H, v12.8H - __ uqsubv(v26, __ T2S, v27, v28); // uqsub v26.2S, v27.2S, v28.2S - __ uqsubv(v2, __ T4S, v3, v4); // uqsub v2.4S, v3.4S, v4.4S - __ uqsubv(v12, __ T2D, v13, v14); // uqsub v12.2D, v13.2D, v14.2D - __ fsub(v18, __ T2S, v19, v20); // fsub v18.2S, v19.2S, v20.2S - __ fsub(v31, __ T4S, v0, v1); // fsub v31.4S, v0.4S, v1.4S - __ fsub(v1, __ T2D, v2, v3); // fsub v1.2D, v2.2D, v3.2D - __ mulv(v13, __ T8B, v14, v15); // mul v13.8B, v14.8B, v15.8B - __ mulv(v29, __ T16B, v30, v31); // mul v29.16B, v30.16B, v31.16B - __ mulv(v0, __ T4H, v1, v2); // mul v0.4H, v1.4H, v2.4H - __ mulv(v19, __ T8H, v20, v21); // mul v19.8H, v20.8H, v21.8H - __ mulv(v12, __ T2S, v13, v14); // mul v12.2S, v13.2S, v14.2S - __ mulv(v17, __ T4S, v18, v19); // mul v17.4S, v18.4S, v19.4S - __ fabd(v22, __ T2S, v23, v24); // fabd v22.2S, v23.2S, v24.2S - __ fabd(v13, __ T4S, v14, v15); // fabd v13.4S, v14.4S, v15.4S - __ fabd(v28, __ T2D, v29, v30); // fabd v28.2D, v29.2D, v30.2D - __ faddp(v30, __ T2S, v31, v0); // faddp v30.2S, 
v31.2S, v0.2S - __ faddp(v31, __ T4S, v0, v1); // faddp v31.4S, v0.4S, v1.4S - __ faddp(v1, __ T2D, v2, v3); // faddp v1.2D, v2.2D, v3.2D - __ fmul(v26, __ T2S, v27, v28); // fmul v26.2S, v27.2S, v28.2S - __ fmul(v28, __ T4S, v29, v30); // fmul v28.4S, v29.4S, v30.4S - __ fmul(v4, __ T2D, v5, v6); // fmul v4.2D, v5.2D, v6.2D - __ mlav(v30, __ T4H, v31, v0); // mla v30.4H, v31.4H, v0.4H - __ mlav(v4, __ T8H, v5, v6); // mla v4.8H, v5.8H, v6.8H - __ mlav(v6, __ T2S, v7, v8); // mla v6.2S, v7.2S, v8.2S - __ mlav(v30, __ T4S, v31, v0); // mla v30.4S, v31.4S, v0.4S - __ fmla(v26, __ T2S, v27, v28); // fmla v26.2S, v27.2S, v28.2S - __ fmla(v18, __ T4S, v19, v20); // fmla v18.4S, v19.4S, v20.4S - __ fmla(v9, __ T2D, v10, v11); // fmla v9.2D, v10.2D, v11.2D - __ mlsv(v8, __ T4H, v9, v10); // mls v8.4H, v9.4H, v10.4H - __ mlsv(v12, __ T8H, v13, v14); // mls v12.8H, v13.8H, v14.8H - __ mlsv(v0, __ T2S, v1, v2); // mls v0.2S, v1.2S, v2.2S - __ mlsv(v20, __ T4S, v21, v22); // mls v20.4S, v21.4S, v22.4S - __ fmls(v1, __ T2S, v2, v3); // fmls v1.2S, v2.2S, v3.2S - __ fmls(v24, __ T4S, v25, v26); // fmls v24.4S, v25.4S, v26.4S - __ fmls(v2, __ T2D, v3, v4); // fmls v2.2D, v3.2D, v4.2D - __ fdiv(v0, __ T2S, v1, v2); // fdiv v0.2S, v1.2S, v2.2S - __ fdiv(v9, __ T4S, v10, v11); // fdiv v9.4S, v10.4S, v11.4S - __ fdiv(v24, __ T2D, v25, v26); // fdiv v24.2D, v25.2D, v26.2D - __ maxv(v26, __ T8B, v27, v28); // smax v26.8B, v27.8B, v28.8B + __ andr(v10, __ T8B, v11, v12); // and v10.8B, v11.8B, v12.8B + __ andr(v26, __ T16B, v27, v28); // and v26.16B, v27.16B, v28.16B + __ orr(v2, __ T8B, v3, v4); // orr v2.8B, v3.8B, v4.8B + __ orr(v12, __ T16B, v13, v14); // orr v12.16B, v13.16B, v14.16B + __ eor(v18, __ T8B, v19, v20); // eor v18.8B, v19.8B, v20.8B + __ eor(v31, __ T16B, v0, v1); // eor v31.16B, v0.16B, v1.16B + __ addv(v1, __ T8B, v2, v3); // add v1.8B, v2.8B, v3.8B + __ addv(v13, __ T16B, v14, v15); // add v13.16B, v14.16B, v15.16B + __ addv(v29, __ T4H, v30, v31); // add v29.4H, 
v30.4H, v31.4H + __ addv(v0, __ T8H, v1, v2); // add v0.8H, v1.8H, v2.8H + __ addv(v19, __ T2S, v20, v21); // add v19.2S, v20.2S, v21.2S + __ addv(v12, __ T4S, v13, v14); // add v12.4S, v13.4S, v14.4S + __ addv(v17, __ T2D, v18, v19); // add v17.2D, v18.2D, v19.2D + __ sqaddv(v22, __ T8B, v23, v24); // sqadd v22.8B, v23.8B, v24.8B + __ sqaddv(v13, __ T16B, v14, v15); // sqadd v13.16B, v14.16B, v15.16B + __ sqaddv(v28, __ T4H, v29, v30); // sqadd v28.4H, v29.4H, v30.4H + __ sqaddv(v30, __ T8H, v31, v0); // sqadd v30.8H, v31.8H, v0.8H + __ sqaddv(v31, __ T2S, v0, v1); // sqadd v31.2S, v0.2S, v1.2S + __ sqaddv(v1, __ T4S, v2, v3); // sqadd v1.4S, v2.4S, v3.4S + __ sqaddv(v26, __ T2D, v27, v28); // sqadd v26.2D, v27.2D, v28.2D + __ uqaddv(v28, __ T8B, v29, v30); // uqadd v28.8B, v29.8B, v30.8B + __ uqaddv(v4, __ T16B, v5, v6); // uqadd v4.16B, v5.16B, v6.16B + __ uqaddv(v30, __ T4H, v31, v0); // uqadd v30.4H, v31.4H, v0.4H + __ uqaddv(v4, __ T8H, v5, v6); // uqadd v4.8H, v5.8H, v6.8H + __ uqaddv(v6, __ T2S, v7, v8); // uqadd v6.2S, v7.2S, v8.2S + __ uqaddv(v30, __ T4S, v31, v0); // uqadd v30.4S, v31.4S, v0.4S + __ uqaddv(v26, __ T2D, v27, v28); // uqadd v26.2D, v27.2D, v28.2D + __ fadd(v18, __ T2S, v19, v20); // fadd v18.2S, v19.2S, v20.2S + __ fadd(v9, __ T4S, v10, v11); // fadd v9.4S, v10.4S, v11.4S + __ fadd(v8, __ T2D, v9, v10); // fadd v8.2D, v9.2D, v10.2D + __ subv(v12, __ T8B, v13, v14); // sub v12.8B, v13.8B, v14.8B + __ subv(v0, __ T16B, v1, v2); // sub v0.16B, v1.16B, v2.16B + __ subv(v20, __ T4H, v21, v22); // sub v20.4H, v21.4H, v22.4H + __ subv(v1, __ T8H, v2, v3); // sub v1.8H, v2.8H, v3.8H + __ subv(v24, __ T2S, v25, v26); // sub v24.2S, v25.2S, v26.2S + __ subv(v2, __ T4S, v3, v4); // sub v2.4S, v3.4S, v4.4S + __ subv(v0, __ T2D, v1, v2); // sub v0.2D, v1.2D, v2.2D + __ sqsubv(v9, __ T8B, v10, v11); // sqsub v9.8B, v10.8B, v11.8B + __ sqsubv(v24, __ T16B, v25, v26); // sqsub v24.16B, v25.16B, v26.16B + __ sqsubv(v26, __ T4H, v27, v28); // sqsub v26.4H, 
v27.4H, v28.4H + __ sqsubv(v16, __ T8H, v17, v18); // sqsub v16.8H, v17.8H, v18.8H + __ sqsubv(v30, __ T2S, v31, v0); // sqsub v30.2S, v31.2S, v0.2S + __ sqsubv(v3, __ T4S, v4, v5); // sqsub v3.4S, v4.4S, v5.4S + __ sqsubv(v10, __ T2D, v11, v12); // sqsub v10.2D, v11.2D, v12.2D + __ uqsubv(v23, __ T8B, v24, v25); // uqsub v23.8B, v24.8B, v25.8B + __ uqsubv(v10, __ T16B, v11, v12); // uqsub v10.16B, v11.16B, v12.16B + __ uqsubv(v4, __ T4H, v5, v6); // uqsub v4.4H, v5.4H, v6.4H + __ uqsubv(v18, __ T8H, v19, v20); // uqsub v18.8H, v19.8H, v20.8H + __ uqsubv(v2, __ T2S, v3, v4); // uqsub v2.2S, v3.2S, v4.2S + __ uqsubv(v11, __ T4S, v12, v13); // uqsub v11.4S, v12.4S, v13.4S + __ uqsubv(v8, __ T2D, v9, v10); // uqsub v8.2D, v9.2D, v10.2D + __ fsub(v10, __ T2S, v11, v12); // fsub v10.2S, v11.2S, v12.2S + __ fsub(v15, __ T4S, v16, v17); // fsub v15.4S, v16.4S, v17.4S + __ fsub(v17, __ T2D, v18, v19); // fsub v17.2D, v18.2D, v19.2D + __ mulv(v2, __ T8B, v3, v4); // mul v2.8B, v3.8B, v4.8B + __ mulv(v10, __ T16B, v11, v12); // mul v10.16B, v11.16B, v12.16B + __ mulv(v12, __ T4H, v13, v14); // mul v12.4H, v13.4H, v14.4H + __ mulv(v12, __ T8H, v13, v14); // mul v12.8H, v13.8H, v14.8H + __ mulv(v15, __ T2S, v16, v17); // mul v15.2S, v16.2S, v17.2S + __ mulv(v13, __ T4S, v14, v15); // mul v13.4S, v14.4S, v15.4S + __ fabd(v2, __ T2S, v3, v4); // fabd v2.2S, v3.2S, v4.2S + __ fabd(v7, __ T4S, v8, v9); // fabd v7.4S, v8.4S, v9.4S + __ fabd(v20, __ T2D, v21, v22); // fabd v20.2D, v21.2D, v22.2D + __ faddp(v26, __ T2S, v27, v28); // faddp v26.2S, v27.2S, v28.2S + __ faddp(v16, __ T4S, v17, v18); // faddp v16.4S, v17.4S, v18.4S + __ faddp(v4, __ T2D, v5, v6); // faddp v4.2D, v5.2D, v6.2D + __ fmul(v2, __ T2S, v3, v4); // fmul v2.2S, v3.2S, v4.2S + __ fmul(v4, __ T4S, v5, v6); // fmul v4.4S, v5.4S, v6.4S + __ fmul(v12, __ T2D, v13, v14); // fmul v12.2D, v13.2D, v14.2D + __ mlav(v18, __ T4H, v19, v20); // mla v18.4H, v19.4H, v20.4H + __ mlav(v21, __ T8H, v22, v23); // mla v21.8H, 
v22.8H, v23.8H + __ mlav(v16, __ T2S, v17, v18); // mla v16.2S, v17.2S, v18.2S + __ mlav(v18, __ T4S, v19, v20); // mla v18.4S, v19.4S, v20.4S + __ fmla(v11, __ T2S, v12, v13); // fmla v11.2S, v12.2S, v13.2S + __ fmla(v21, __ T4S, v22, v23); // fmla v21.4S, v22.4S, v23.4S + __ fmla(v23, __ T2D, v24, v25); // fmla v23.2D, v24.2D, v25.2D + __ mlsv(v12, __ T4H, v13, v14); // mls v12.4H, v13.4H, v14.4H + __ mlsv(v26, __ T8H, v27, v28); // mls v26.8H, v27.8H, v28.8H + __ mlsv(v23, __ T2S, v24, v25); // mls v23.2S, v24.2S, v25.2S + __ mlsv(v28, __ T4S, v29, v30); // mls v28.4S, v29.4S, v30.4S + __ fmls(v14, __ T2S, v15, v16); // fmls v14.2S, v15.2S, v16.2S + __ fmls(v11, __ T4S, v12, v13); // fmls v11.4S, v12.4S, v13.4S + __ fmls(v24, __ T2D, v25, v26); // fmls v24.2D, v25.2D, v26.2D + __ fdiv(v1, __ T2S, v2, v3); // fdiv v1.2S, v2.2S, v3.2S + __ fdiv(v12, __ T4S, v13, v14); // fdiv v12.4S, v13.4S, v14.4S + __ fdiv(v31, __ T2D, v0, v1); // fdiv v31.2D, v0.2D, v1.2D + __ maxv(v10, __ T8B, v11, v12); // smax v10.8B, v11.8B, v12.8B __ maxv(v16, __ T16B, v17, v18); // smax v16.16B, v17.16B, v18.16B - __ maxv(v30, __ T4H, v31, v0); // smax v30.4H, v31.4H, v0.4H - __ maxv(v3, __ T8H, v4, v5); // smax v3.8H, v4.8H, v5.8H - __ maxv(v10, __ T2S, v11, v12); // smax v10.2S, v11.2S, v12.2S - __ maxv(v23, __ T4S, v24, v25); // smax v23.4S, v24.4S, v25.4S - __ umaxv(v10, __ T8B, v11, v12); // umax v10.8B, v11.8B, v12.8B - __ umaxv(v4, __ T16B, v5, v6); // umax v4.16B, v5.16B, v6.16B - __ umaxv(v18, __ T4H, v19, v20); // umax v18.4H, v19.4H, v20.4H - __ umaxv(v2, __ T8H, v3, v4); // umax v2.8H, v3.8H, v4.8H - __ umaxv(v11, __ T2S, v12, v13); // umax v11.2S, v12.2S, v13.2S - __ umaxv(v8, __ T4S, v9, v10); // umax v8.4S, v9.4S, v10.4S - __ smaxp(v10, __ T8B, v11, v12); // smaxp v10.8B, v11.8B, v12.8B - __ smaxp(v15, __ T16B, v16, v17); // smaxp v15.16B, v16.16B, v17.16B - __ smaxp(v17, __ T4H, v18, v19); // smaxp v17.4H, v18.4H, v19.4H - __ smaxp(v2, __ T8H, v3, v4); // smaxp v2.8H, 
v3.8H, v4.8H - __ smaxp(v10, __ T2S, v11, v12); // smaxp v10.2S, v11.2S, v12.2S - __ smaxp(v12, __ T4S, v13, v14); // smaxp v12.4S, v13.4S, v14.4S - __ fmax(v12, __ T2S, v13, v14); // fmax v12.2S, v13.2S, v14.2S - __ fmax(v15, __ T4S, v16, v17); // fmax v15.4S, v16.4S, v17.4S - __ fmax(v13, __ T2D, v14, v15); // fmax v13.2D, v14.2D, v15.2D - __ minv(v2, __ T8B, v3, v4); // smin v2.8B, v3.8B, v4.8B - __ minv(v7, __ T16B, v8, v9); // smin v7.16B, v8.16B, v9.16B - __ minv(v20, __ T4H, v21, v22); // smin v20.4H, v21.4H, v22.4H - __ minv(v26, __ T8H, v27, v28); // smin v26.8H, v27.8H, v28.8H - __ minv(v16, __ T2S, v17, v18); // smin v16.2S, v17.2S, v18.2S - __ minv(v4, __ T4S, v5, v6); // smin v4.4S, v5.4S, v6.4S - __ uminv(v2, __ T8B, v3, v4); // umin v2.8B, v3.8B, v4.8B - __ uminv(v4, __ T16B, v5, v6); // umin v4.16B, v5.16B, v6.16B - __ uminv(v12, __ T4H, v13, v14); // umin v12.4H, v13.4H, v14.4H - __ uminv(v18, __ T8H, v19, v20); // umin v18.8H, v19.8H, v20.8H - __ uminv(v21, __ T2S, v22, v23); // umin v21.2S, v22.2S, v23.2S - __ uminv(v16, __ T4S, v17, v18); // umin v16.4S, v17.4S, v18.4S - __ sminp(v18, __ T8B, v19, v20); // sminp v18.8B, v19.8B, v20.8B - __ sminp(v11, __ T16B, v12, v13); // sminp v11.16B, v12.16B, v13.16B - __ sminp(v21, __ T4H, v22, v23); // sminp v21.4H, v22.4H, v23.4H - __ sminp(v23, __ T8H, v24, v25); // sminp v23.8H, v24.8H, v25.8H - __ sminp(v12, __ T2S, v13, v14); // sminp v12.2S, v13.2S, v14.2S - __ sminp(v26, __ T4S, v27, v28); // sminp v26.4S, v27.4S, v28.4S - __ sqdmulh(v23, __ T4H, v24, v25); // sqdmulh v23.4H, v24.4H, v25.4H - __ sqdmulh(v28, __ T8H, v29, v30); // sqdmulh v28.8H, v29.8H, v30.8H - __ sqdmulh(v14, __ T2S, v15, v16); // sqdmulh v14.2S, v15.2S, v16.2S - __ sqdmulh(v11, __ T4S, v12, v13); // sqdmulh v11.4S, v12.4S, v13.4S - __ shsubv(v24, __ T8B, v25, v26); // shsub v24.8B, v25.8B, v26.8B - __ shsubv(v1, __ T16B, v2, v3); // shsub v1.16B, v2.16B, v3.16B - __ shsubv(v12, __ T4H, v13, v14); // shsub v12.4H, v13.4H, v14.4H - 
__ shsubv(v31, __ T8H, v0, v1); // shsub v31.8H, v0.8H, v1.8H - __ shsubv(v10, __ T2S, v11, v12); // shsub v10.2S, v11.2S, v12.2S - __ shsubv(v16, __ T4S, v17, v18); // shsub v16.4S, v17.4S, v18.4S - __ fmin(v7, __ T2S, v8, v9); // fmin v7.2S, v8.2S, v9.2S - __ fmin(v2, __ T4S, v3, v4); // fmin v2.4S, v3.4S, v4.4S - __ fmin(v3, __ T2D, v4, v5); // fmin v3.2D, v4.2D, v5.2D - __ facgt(v13, __ T2S, v14, v15); // facgt v13.2S, v14.2S, v15.2S - __ facgt(v19, __ T4S, v20, v21); // facgt v19.4S, v20.4S, v21.4S - __ facgt(v17, __ T2D, v18, v19); // facgt v17.2D, v18.2D, v19.2D + __ maxv(v7, __ T4H, v8, v9); // smax v7.4H, v8.4H, v9.4H + __ maxv(v2, __ T8H, v3, v4); // smax v2.8H, v3.8H, v4.8H + __ maxv(v3, __ T2S, v4, v5); // smax v3.2S, v4.2S, v5.2S + __ maxv(v13, __ T4S, v14, v15); // smax v13.4S, v14.4S, v15.4S + __ umaxv(v19, __ T8B, v20, v21); // umax v19.8B, v20.8B, v21.8B + __ umaxv(v17, __ T16B, v18, v19); // umax v17.16B, v18.16B, v19.16B + __ umaxv(v16, __ T4H, v17, v18); // umax v16.4H, v17.4H, v18.4H + __ umaxv(v3, __ T8H, v4, v5); // umax v3.8H, v4.8H, v5.8H + __ umaxv(v1, __ T2S, v2, v3); // umax v1.2S, v2.2S, v3.2S + __ umaxv(v11, __ T4S, v12, v13); // umax v11.4S, v12.4S, v13.4S + __ smaxp(v30, __ T8B, v31, v0); // smaxp v30.8B, v31.8B, v0.8B + __ smaxp(v5, __ T16B, v6, v7); // smaxp v5.16B, v6.16B, v7.16B + __ smaxp(v8, __ T4H, v9, v10); // smaxp v8.4H, v9.4H, v10.4H + __ smaxp(v15, __ T8H, v16, v17); // smaxp v15.8H, v16.8H, v17.8H + __ smaxp(v29, __ T2S, v30, v31); // smaxp v29.2S, v30.2S, v31.2S + __ smaxp(v30, __ T4S, v31, v0); // smaxp v30.4S, v31.4S, v0.4S + __ fmax(v0, __ T2S, v1, v2); // fmax v0.2S, v1.2S, v2.2S + __ fmax(v20, __ T4S, v21, v22); // fmax v20.4S, v21.4S, v22.4S + __ fmax(v7, __ T2D, v8, v9); // fmax v7.2D, v8.2D, v9.2D + __ minv(v20, __ T8B, v21, v22); // smin v20.8B, v21.8B, v22.8B + __ minv(v23, __ T16B, v24, v25); // smin v23.16B, v24.16B, v25.16B + __ minv(v28, __ T4H, v29, v30); // smin v28.4H, v29.4H, v30.4H + __ minv(v21, __ 
T8H, v22, v23); // smin v21.8H, v22.8H, v23.8H + __ minv(v27, __ T2S, v28, v29); // smin v27.2S, v28.2S, v29.2S + __ minv(v25, __ T4S, v26, v27); // smin v25.4S, v26.4S, v27.4S + __ uminv(v5, __ T8B, v6, v7); // umin v5.8B, v6.8B, v7.8B + __ uminv(v1, __ T16B, v2, v3); // umin v1.16B, v2.16B, v3.16B + __ uminv(v23, __ T4H, v24, v25); // umin v23.4H, v24.4H, v25.4H + __ uminv(v16, __ T8H, v17, v18); // umin v16.8H, v17.8H, v18.8H + __ uminv(v31, __ T2S, v0, v1); // umin v31.2S, v0.2S, v1.2S + __ uminv(v5, __ T4S, v6, v7); // umin v5.4S, v6.4S, v7.4S + __ sminp(v12, __ T8B, v13, v14); // sminp v12.8B, v13.8B, v14.8B + __ sminp(v9, __ T16B, v10, v11); // sminp v9.16B, v10.16B, v11.16B + __ sminp(v28, __ T4H, v29, v30); // sminp v28.4H, v29.4H, v30.4H + __ sminp(v15, __ T8H, v16, v17); // sminp v15.8H, v16.8H, v17.8H + __ sminp(v29, __ T2S, v30, v31); // sminp v29.2S, v30.2S, v31.2S + __ sminp(v22, __ T4S, v23, v24); // sminp v22.4S, v23.4S, v24.4S + __ sqdmulh(v31, __ T4H, v0, v1); // sqdmulh v31.4H, v0.4H, v1.4H + __ sqdmulh(v19, __ T8H, v20, v21); // sqdmulh v19.8H, v20.8H, v21.8H + __ sqdmulh(v31, __ T2S, v0, v1); // sqdmulh v31.2S, v0.2S, v1.2S + __ sqdmulh(v5, __ T4S, v6, v7); // sqdmulh v5.4S, v6.4S, v7.4S + __ shsubv(v14, __ T8B, v15, v16); // shsub v14.8B, v15.8B, v16.8B + __ shsubv(v18, __ T16B, v19, v20); // shsub v18.16B, v19.16B, v20.16B + __ shsubv(v31, __ T4H, v0, v1); // shsub v31.4H, v0.4H, v1.4H + __ shsubv(v18, __ T8H, v19, v20); // shsub v18.8H, v19.8H, v20.8H + __ shsubv(v27, __ T2S, v28, v29); // shsub v27.2S, v28.2S, v29.2S + __ shsubv(v20, __ T4S, v21, v22); // shsub v20.4S, v21.4S, v22.4S + __ fmin(v16, __ T2S, v17, v18); // fmin v16.2S, v17.2S, v18.2S + __ fmin(v12, __ T4S, v13, v14); // fmin v12.4S, v13.4S, v14.4S + __ fmin(v11, __ T2D, v12, v13); // fmin v11.2D, v12.2D, v13.2D + __ facgt(v9, __ T2S, v10, v11); // facgt v9.2S, v10.2S, v11.2S + __ facgt(v6, __ T4S, v7, v8); // facgt v6.4S, v7.4S, v8.4S + __ facgt(v30, __ T2D, v31, v0); // 
facgt v30.2D, v31.2D, v0.2D // VectorScalarNEONInstruction - __ fmlavs(v1, __ T2S, v2, v3, 1); // fmla v1.2S, v2.2S, v3.S[1] - __ mulvs(v5, __ T4S, v6, v7, 0); // mul v5.4S, v6.4S, v7.S[0] - __ fmlavs(v2, __ T2D, v3, v4, 1); // fmla v2.2D, v3.2D, v4.D[1] - __ fmlsvs(v7, __ T2S, v8, v9, 0); // fmls v7.2S, v8.2S, v9.S[0] + __ fmlavs(v13, __ T2S, v14, v15, 1); // fmla v13.2S, v14.2S, v15.S[1] __ mulvs(v15, __ T4S, v0, v1, 3); // mul v15.4S, v0.4S, v1.S[3] - __ fmlsvs(v10, __ T2D, v11, v12, 0); // fmls v10.2D, v11.2D, v12.D[0] - __ fmulxvs(v10, __ T2S, v11, v12, 0); // fmulx v10.2S, v11.2S, v12.S[0] - __ mulvs(v14, __ T4S, v15, v16, 2); // mul v14.4S, v15.4S, v16.S[2] - __ fmulxvs(v13, __ T2D, v14, v15, 1); // fmulx v13.2D, v14.2D, v15.D[1] - __ mulvs(v2, __ T4H, v3, v4, 3); // mul v2.4H, v3.4H, v4.H[3] - __ mulvs(v11, __ T8H, v12, v13, 0); // mul v11.8H, v12.8H, v13.H[0] - __ mulvs(v15, __ T2S, v0, v1, 1); // mul v15.2S, v0.2S, v1.S[1] - __ mulvs(v6, __ T4S, v7, v8, 0); // mul v6.4S, v7.4S, v8.S[0] + __ fmlavs(v5, __ T2D, v6, v7, 0); // fmla v5.2D, v6.2D, v7.D[0] + __ fmlsvs(v5, __ T2S, v6, v7, 1); // fmls v5.2S, v6.2S, v7.S[1] + __ mulvs(v12, __ T4S, v13, v14, 0); // mul v12.4S, v13.4S, v14.S[0] + __ fmlsvs(v8, __ T2D, v9, v10, 1); // fmls v8.2D, v9.2D, v10.D[1] + __ fmulxvs(v1, __ T2S, v2, v3, 1); // fmulx v1.2S, v2.2S, v3.S[1] + __ mulvs(v7, __ T4S, v8, v9, 3); // mul v7.4S, v8.4S, v9.S[3] + __ fmulxvs(v9, __ T2D, v10, v11, 1); // fmulx v9.2D, v10.2D, v11.D[1] + __ mulvs(v11, __ T4H, v12, v13, 2); // mul v11.4H, v12.4H, v13.H[2] + __ mulvs(v7, __ T8H, v8, v9, 0); // mul v7.8H, v8.8H, v9.H[0] + __ mulvs(v6, __ T2S, v7, v8, 0); // mul v6.2S, v7.2S, v8.S[0] + __ mulvs(v5, __ T4S, v6, v7, 2); // mul v5.4S, v6.4S, v7.S[2] // NEONVectorCompare - __ cm(Assembler::GT, v9, __ T8B, v10, v11); // cmgt v9.8B, v10.8B, v11.8B - __ cm(Assembler::GT, v28, __ T16B, v29, v30); // cmgt v28.16B, v29.16B, v30.16B - __ cm(Assembler::GT, v15, __ T4H, v16, v17); // cmgt v15.4H, v16.4H, 
v17.4H - __ cm(Assembler::GT, v29, __ T8H, v30, v31); // cmgt v29.8H, v30.8H, v31.8H - __ cm(Assembler::GT, v22, __ T2S, v23, v24); // cmgt v22.2S, v23.2S, v24.2S - __ cm(Assembler::GT, v31, __ T4S, v0, v1); // cmgt v31.4S, v0.4S, v1.4S - __ cm(Assembler::GT, v19, __ T2D, v20, v21); // cmgt v19.2D, v20.2D, v21.2D - __ cm(Assembler::GE, v31, __ T8B, v0, v1); // cmge v31.8B, v0.8B, v1.8B - __ cm(Assembler::GE, v5, __ T16B, v6, v7); // cmge v5.16B, v6.16B, v7.16B - __ cm(Assembler::GE, v14, __ T4H, v15, v16); // cmge v14.4H, v15.4H, v16.4H - __ cm(Assembler::GE, v18, __ T8H, v19, v20); // cmge v18.8H, v19.8H, v20.8H - __ cm(Assembler::GE, v31, __ T2S, v0, v1); // cmge v31.2S, v0.2S, v1.2S - __ cm(Assembler::GE, v18, __ T4S, v19, v20); // cmge v18.4S, v19.4S, v20.4S - __ cm(Assembler::GE, v27, __ T2D, v28, v29); // cmge v27.2D, v28.2D, v29.2D - __ cm(Assembler::EQ, v20, __ T8B, v21, v22); // cmeq v20.8B, v21.8B, v22.8B - __ cm(Assembler::EQ, v16, __ T16B, v17, v18); // cmeq v16.16B, v17.16B, v18.16B - __ cm(Assembler::EQ, v12, __ T4H, v13, v14); // cmeq v12.4H, v13.4H, v14.4H - __ cm(Assembler::EQ, v11, __ T8H, v12, v13); // cmeq v11.8H, v12.8H, v13.8H - __ cm(Assembler::EQ, v9, __ T2S, v10, v11); // cmeq v9.2S, v10.2S, v11.2S - __ cm(Assembler::EQ, v6, __ T4S, v7, v8); // cmeq v6.4S, v7.4S, v8.4S - __ cm(Assembler::EQ, v30, __ T2D, v31, v0); // cmeq v30.2D, v31.2D, v0.2D - __ cm(Assembler::HI, v17, __ T8B, v18, v19); // cmhi v17.8B, v18.8B, v19.8B - __ cm(Assembler::HI, v27, __ T16B, v28, v29); // cmhi v27.16B, v28.16B, v29.16B - __ cm(Assembler::HI, v28, __ T4H, v29, v30); // cmhi v28.4H, v29.4H, v30.4H - __ cm(Assembler::HI, v30, __ T8H, v31, v0); // cmhi v30.8H, v31.8H, v0.8H - __ cm(Assembler::HI, v7, __ T2S, v8, v9); // cmhi v7.2S, v8.2S, v9.2S - __ cm(Assembler::HI, v10, __ T4S, v11, v12); // cmhi v10.4S, v11.4S, v12.4S - __ cm(Assembler::HI, v20, __ T2D, v21, v22); // cmhi v20.2D, v21.2D, v22.2D - __ cm(Assembler::HS, v10, __ T8B, v11, v12); // cmhs v10.8B, 
v11.8B, v12.8B - __ cm(Assembler::HS, v4, __ T16B, v5, v6); // cmhs v4.16B, v5.16B, v6.16B - __ cm(Assembler::HS, v24, __ T4H, v25, v26); // cmhs v24.4H, v25.4H, v26.4H - __ cm(Assembler::HS, v17, __ T8H, v18, v19); // cmhs v17.8H, v18.8H, v19.8H - __ cm(Assembler::HS, v17, __ T2S, v18, v19); // cmhs v17.2S, v18.2S, v19.2S - __ cm(Assembler::HS, v22, __ T4S, v23, v24); // cmhs v22.4S, v23.4S, v24.4S - __ cm(Assembler::HS, v3, __ T2D, v4, v5); // cmhs v3.2D, v4.2D, v5.2D - __ fcm(Assembler::EQ, v29, __ T2S, v30, v31); // fcmeq v29.2S, v30.2S, v31.2S - __ fcm(Assembler::EQ, v15, __ T4S, v16, v17); // fcmeq v15.4S, v16.4S, v17.4S - __ fcm(Assembler::EQ, v22, __ T2D, v23, v24); // fcmeq v22.2D, v23.2D, v24.2D + __ cm(Assembler::GT, v13, __ T8B, v14, v15); // cmgt v13.8B, v14.8B, v15.8B + __ cm(Assembler::GT, v23, __ T16B, v24, v25); // cmgt v23.16B, v24.16B, v25.16B + __ cm(Assembler::GT, v1, __ T4H, v2, v3); // cmgt v1.4H, v2.4H, v3.4H + __ cm(Assembler::GT, v30, __ T8H, v31, v0); // cmgt v30.8H, v31.8H, v0.8H + __ cm(Assembler::GT, v19, __ T2S, v20, v21); // cmgt v19.2S, v20.2S, v21.2S + __ cm(Assembler::GT, v5, __ T4S, v6, v7); // cmgt v5.4S, v6.4S, v7.4S + __ cm(Assembler::GT, v17, __ T2D, v18, v19); // cmgt v17.2D, v18.2D, v19.2D + __ cm(Assembler::GE, v2, __ T8B, v3, v4); // cmge v2.8B, v3.8B, v4.8B + __ cm(Assembler::GE, v16, __ T16B, v17, v18); // cmge v16.16B, v17.16B, v18.16B + __ cm(Assembler::GE, v22, __ T4H, v23, v24); // cmge v22.4H, v23.4H, v24.4H + __ cm(Assembler::GE, v13, __ T8H, v14, v15); // cmge v13.8H, v14.8H, v15.8H + __ cm(Assembler::GE, v10, __ T2S, v11, v12); // cmge v10.2S, v11.2S, v12.2S + __ cm(Assembler::GE, v21, __ T4S, v22, v23); // cmge v21.4S, v22.4S, v23.4S + __ cm(Assembler::GE, v29, __ T2D, v30, v31); // cmge v29.2D, v30.2D, v31.2D + __ cm(Assembler::EQ, v27, __ T8B, v28, v29); // cmeq v27.8B, v28.8B, v29.8B + __ cm(Assembler::EQ, v12, __ T16B, v13, v14); // cmeq v12.16B, v13.16B, v14.16B + __ cm(Assembler::EQ, v27, __ T4H, v28, 
v29); // cmeq v27.4H, v28.4H, v29.4H + __ cm(Assembler::EQ, v3, __ T8H, v4, v5); // cmeq v3.8H, v4.8H, v5.8H + __ cm(Assembler::EQ, v1, __ T2S, v2, v3); // cmeq v1.2S, v2.2S, v3.2S + __ cm(Assembler::EQ, v31, __ T4S, v0, v1); // cmeq v31.4S, v0.4S, v1.4S + __ cm(Assembler::EQ, v24, __ T2D, v25, v26); // cmeq v24.2D, v25.2D, v26.2D + __ cm(Assembler::HI, v19, __ T8B, v20, v21); // cmhi v19.8B, v20.8B, v21.8B + __ cm(Assembler::HI, v17, __ T16B, v18, v19); // cmhi v17.16B, v18.16B, v19.16B + __ cm(Assembler::HI, v9, __ T4H, v10, v11); // cmhi v9.4H, v10.4H, v11.4H + __ cm(Assembler::HI, v28, __ T8H, v29, v30); // cmhi v28.8H, v29.8H, v30.8H + __ cm(Assembler::HI, v27, __ T2S, v28, v29); // cmhi v27.2S, v28.2S, v29.2S + __ cm(Assembler::HI, v15, __ T4S, v16, v17); // cmhi v15.4S, v16.4S, v17.4S + __ cm(Assembler::HI, v7, __ T2D, v8, v9); // cmhi v7.2D, v8.2D, v9.2D + __ cm(Assembler::HS, v21, __ T8B, v22, v23); // cmhs v21.8B, v22.8B, v23.8B + __ cm(Assembler::HS, v23, __ T16B, v24, v25); // cmhs v23.16B, v24.16B, v25.16B + __ cm(Assembler::HS, v31, __ T4H, v0, v1); // cmhs v31.4H, v0.4H, v1.4H + __ cm(Assembler::HS, v25, __ T8H, v26, v27); // cmhs v25.8H, v26.8H, v27.8H + __ cm(Assembler::HS, v2, __ T2S, v3, v4); // cmhs v2.2S, v3.2S, v4.2S + __ cm(Assembler::HS, v31, __ T4S, v0, v1); // cmhs v31.4S, v0.4S, v1.4S + __ cm(Assembler::HS, v27, __ T2D, v28, v29); // cmhs v27.2D, v28.2D, v29.2D + __ fcm(Assembler::EQ, v18, __ T2S, v19, v20); // fcmeq v18.2S, v19.2S, v20.2S + __ fcm(Assembler::EQ, v10, __ T4S, v11, v12); // fcmeq v10.4S, v11.4S, v12.4S + __ fcm(Assembler::EQ, v23, __ T2D, v24, v25); // fcmeq v23.2D, v24.2D, v25.2D __ fcm(Assembler::GT, v19, __ T2S, v20, v21); // fcmgt v19.2S, v20.2S, v21.2S - __ fcm(Assembler::GT, v19, __ T4S, v20, v21); // fcmgt v19.4S, v20.4S, v21.4S - __ fcm(Assembler::GT, v22, __ T2D, v23, v24); // fcmgt v22.2D, v23.2D, v24.2D - __ fcm(Assembler::GE, v2, __ T2S, v3, v4); // fcmge v2.2S, v3.2S, v4.2S - __ fcm(Assembler::GE, v15, __ T4S, 
v16, v17); // fcmge v15.4S, v16.4S, v17.4S - __ fcm(Assembler::GE, v6, __ T2D, v7, v8); // fcmge v6.2D, v7.2D, v8.2D + __ fcm(Assembler::GT, v3, __ T4S, v4, v5); // fcmgt v3.4S, v4.4S, v5.4S + __ fcm(Assembler::GT, v18, __ T2D, v19, v20); // fcmgt v18.2D, v19.2D, v20.2D + __ fcm(Assembler::GE, v0, __ T2S, v1, v2); // fcmge v0.2S, v1.2S, v2.2S + __ fcm(Assembler::GE, v25, __ T4S, v26, v27); // fcmge v25.4S, v26.4S, v27.4S + __ fcm(Assembler::GE, v26, __ T2D, v27, v28); // fcmge v26.2D, v27.2D, v28.2D // SVEComparisonWithZero - __ sve_fcm(Assembler::EQ, p6, __ S, p3, z16, 0.0); // fcmeq p6.s, p3/z, z16.s, #0.0 - __ sve_fcm(Assembler::GT, p11, __ D, p4, z1, 0.0); // fcmgt p11.d, p4/z, z1.d, #0.0 - __ sve_fcm(Assembler::GE, p2, __ S, p4, z17, 0.0); // fcmge p2.s, p4/z, z17.s, #0.0 - __ sve_fcm(Assembler::LT, p11, __ S, p5, z13, 0.0); // fcmlt p11.s, p5/z, z13.s, #0.0 - __ sve_fcm(Assembler::LE, p14, __ S, p6, z27, 0.0); // fcmle p14.s, p6/z, z27.s, #0.0 - __ sve_fcm(Assembler::NE, p1, __ D, p6, z1, 0.0); // fcmne p1.d, p6/z, z1.d, #0.0 + __ sve_fcm(Assembler::EQ, p11, __ D, p3, z2, 0.0); // fcmeq p11.d, p3/z, z2.d, #0.0 + __ sve_fcm(Assembler::GT, p2, __ D, p7, z28, 0.0); // fcmgt p2.d, p7/z, z28.d, #0.0 + __ sve_fcm(Assembler::GE, p8, __ S, p2, z27, 0.0); // fcmge p8.s, p2/z, z27.s, #0.0 + __ sve_fcm(Assembler::LT, p14, __ S, p1, z18, 0.0); // fcmlt p14.s, p1/z, z18.s, #0.0 + __ sve_fcm(Assembler::LE, p3, __ S, p5, z15, 0.0); // fcmle p3.s, p5/z, z15.s, #0.0 + __ sve_fcm(Assembler::NE, p4, __ D, p5, z2, 0.0); // fcmne p4.d, p5/z, z2.d, #0.0 // SVEComparisonWithImm - __ sve_cmp(Assembler::EQ, p9, __ H, p7, z17, 11); // cmpeq p9.h, p7/z, z17.h, #11 - __ sve_cmp(Assembler::GT, p7, __ S, p5, z7, 15); // cmpgt p7.s, p5/z, z7.s, #15 - __ sve_cmp(Assembler::GE, p12, __ D, p6, z2, 2); // cmpge p12.d, p6/z, z2.d, #2 - __ sve_cmp(Assembler::LT, p5, __ S, p0, z23, 2); // cmplt p5.s, p0/z, z23.s, #2 - __ sve_cmp(Assembler::LE, p0, __ D, p5, z25, -14); // cmple p0.d, p5/z, z25.d, 
#-14 - __ sve_cmp(Assembler::NE, p9, __ B, p7, z12, 14); // cmpne p9.b, p7/z, z12.b, #14 - __ sve_cmp(Assembler::HS, p14, __ D, p1, z16, 37); // cmphs p14.d, p1/z, z16.d, #37 - __ sve_cmp(Assembler::HI, p14, __ B, p1, z18, 29); // cmphi p14.b, p1/z, z18.b, #29 - __ sve_cmp(Assembler::LS, p7, __ S, p2, z9, 10); // cmpls p7.s, p2/z, z9.s, #10 - __ sve_cmp(Assembler::LO, p14, __ D, p1, z21, 111); // cmplo p14.d, p1/z, z21.d, #111 + __ sve_cmp(Assembler::EQ, p15, __ D, p0, z5, 1); // cmpeq p15.d, p0/z, z5.d, #1 + __ sve_cmp(Assembler::GT, p7, __ D, p2, z4, 12); // cmpgt p7.d, p2/z, z4.d, #12 + __ sve_cmp(Assembler::GE, p11, __ D, p6, z27, 7); // cmpge p11.d, p6/z, z27.d, #7 + __ sve_cmp(Assembler::LT, p0, __ B, p4, z4, -16); // cmplt p0.b, p4/z, z4.b, #-16 + __ sve_cmp(Assembler::LE, p2, __ B, p2, z15, -9); // cmple p2.b, p2/z, z15.b, #-9 + __ sve_cmp(Assembler::NE, p2, __ D, p1, z10, 4); // cmpne p2.d, p1/z, z10.d, #4 + __ sve_cmp(Assembler::HS, p11, __ B, p2, z21, 34); // cmphs p11.b, p2/z, z21.b, #34 + __ sve_cmp(Assembler::HI, p8, __ B, p4, z31, 8); // cmphi p8.b, p4/z, z31.b, #8 + __ sve_cmp(Assembler::LS, p6, __ D, p0, z30, 109); // cmpls p6.d, p0/z, z30.d, #109 + __ sve_cmp(Assembler::LO, p11, __ H, p3, z29, 114); // cmplo p11.h, p3/z, z29.h, #114 // SpecialCases __ ccmn(zr, zr, 3u, Assembler::LE); // ccmn xzr, xzr, #3, LE @@ -1136,239 +1152,239 @@ __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 // LSEOp - __ swp(Assembler::xword, r0, r17, r15); // swp x0, x17, [x15] - __ ldadd(Assembler::xword, r4, r26, r8); // ldadd x4, x26, [x8] - __ ldbic(Assembler::xword, r28, r22, r27); // ldclr x28, x22, [x27] - __ ldeor(Assembler::xword, r27, r25, r23); // ldeor x27, x25, [x23] - __ ldorr(Assembler::xword, r0, r4, r6); // ldset x0, x4, [x6] - __ ldsmin(Assembler::xword, r16, r0, r4); // ldsmin x16, x0, [x4] - __ ldsmax(Assembler::xword, r15, r1, r10); // ldsmax x15, x1, [x10] - __ ldumin(Assembler::xword, r7, r5, r10); // ldumin x7, x5, [x10] - __ ldumax(Assembler::xword, 
r28, r7, r20); // ldumax x28, x7, [x20] + __ swp(Assembler::xword, r17, r24, r5); // swp x17, x24, [x5] + __ ldadd(Assembler::xword, r2, r14, r10); // ldadd x2, x14, [x10] + __ ldbic(Assembler::xword, r16, r11, r27); // ldclr x16, x11, [x27] + __ ldeor(Assembler::xword, r23, r12, r4); // ldeor x23, x12, [x4] + __ ldorr(Assembler::xword, r22, r17, r4); // ldset x22, x17, [x4] + __ ldsmin(Assembler::xword, r1, r19, r16); // ldsmin x1, x19, [x16] + __ ldsmax(Assembler::xword, r16, r13, r14); // ldsmax x16, x13, [x14] + __ ldumin(Assembler::xword, r12, r2, r17); // ldumin x12, x2, [x17] + __ ldumax(Assembler::xword, r3, r21, r23); // ldumax x3, x21, [x23] // LSEOp - __ swpa(Assembler::xword, r23, r21, r6); // swpa x23, x21, [x6] - __ ldadda(Assembler::xword, r11, r8, r17); // ldadda x11, x8, [x17] - __ ldbica(Assembler::xword, zr, r6, r17); // ldclra xzr, x6, [x17] - __ ldeora(Assembler::xword, r2, r12, r30); // ldeora x2, x12, [x30] - __ ldorra(Assembler::xword, r29, r3, r27); // ldseta x29, x3, [x27] - __ ldsmina(Assembler::xword, r22, r29, r14); // ldsmina x22, x29, [x14] - __ ldsmaxa(Assembler::xword, r13, r28, r17); // ldsmaxa x13, x28, [x17] - __ ldumina(Assembler::xword, r24, r5, r2); // ldumina x24, x5, [x2] - __ ldumaxa(Assembler::xword, r14, r10, r16); // ldumaxa x14, x10, [x16] + __ swpa(Assembler::xword, r5, r6, r7); // swpa x5, x6, [x7] + __ ldadda(Assembler::xword, r19, r13, r28); // ldadda x19, x13, [x28] + __ ldbica(Assembler::xword, r17, r16, r6); // ldclra x17, x16, [x6] + __ ldeora(Assembler::xword, r2, r29, r3); // ldeora x2, x29, [x3] + __ ldorra(Assembler::xword, r4, r6, r15); // ldseta x4, x6, [x15] + __ ldsmina(Assembler::xword, r20, r13, r12); // ldsmina x20, x13, [x12] + __ ldsmaxa(Assembler::xword, r20, r8, r25); // ldsmaxa x20, x8, [x25] + __ ldumina(Assembler::xword, r20, r19, r0); // ldumina x20, x19, [x0] + __ ldumaxa(Assembler::xword, r11, r24, r6); // ldumaxa x11, x24, [x6] // LSEOp - __ swpal(Assembler::xword, r11, r27, r23); // swpal 
x11, x27, [x23] - __ ldaddal(Assembler::xword, r12, r4, r22); // ldaddal x12, x4, [x22] - __ ldbical(Assembler::xword, r17, r4, r1); // ldclral x17, x4, [x1] - __ ldeoral(Assembler::xword, r19, r16, r15); // ldeoral x19, x16, [x15] - __ ldorral(Assembler::xword, r13, r14, r12); // ldsetal x13, x14, [x12] - __ ldsminal(Assembler::xword, r2, r17, r3); // ldsminal x2, x17, [x3] - __ ldsmaxal(Assembler::xword, r21, r23, r5); // ldsmaxal x21, x23, [x5] - __ lduminal(Assembler::xword, r6, r7, r19); // lduminal x6, x7, [x19] - __ ldumaxal(Assembler::xword, r13, r28, r17); // ldumaxal x13, x28, [x17] + __ swpal(Assembler::xword, r20, zr, r14); // swpal x20, xzr, [x14] + __ ldaddal(Assembler::xword, r16, r6, r0); // ldaddal x16, x6, [x0] + __ ldbical(Assembler::xword, r7, r15, r19); // ldclral x7, x15, [x19] + __ ldeoral(Assembler::xword, r26, r9, r10); // ldeoral x26, x9, [x10] + __ ldorral(Assembler::xword, r23, r21, r22); // ldsetal x23, x21, [x22] + __ ldsminal(Assembler::xword, r28, r2, r3); // ldsminal x28, x2, [x3] + __ ldsmaxal(Assembler::xword, r15, r19, r20); // ldsmaxal x15, x19, [x20] + __ lduminal(Assembler::xword, r7, r4, r29); // lduminal x7, x4, [x29] + __ ldumaxal(Assembler::xword, r7, r0, r9); // ldumaxal x7, x0, [x9] // LSEOp - __ swpl(Assembler::xword, r16, r6, r2); // swpl x16, x6, [x2] - __ ldaddl(Assembler::xword, r29, r3, r4); // ldaddl x29, x3, [x4] - __ ldbicl(Assembler::xword, r6, r16, r20); // ldclrl x6, x16, [x20] - __ ldeorl(Assembler::xword, r13, r12, r20); // ldeorl x13, x12, [x20] - __ ldorrl(Assembler::xword, r8, r25, r20); // ldsetl x8, x25, [x20] - __ ldsminl(Assembler::xword, r19, r0, r11); // ldsminl x19, x0, [x11] - __ ldsmaxl(Assembler::xword, r24, r6, r20); // ldsmaxl x24, x6, [x20] - __ lduminl(Assembler::xword, zr, r14, r16); // lduminl xzr, x14, [x16] - __ ldumaxl(Assembler::xword, r6, r0, r7); // ldumaxl x6, x0, [x7] + __ swpl(Assembler::xword, r16, r20, r23); // swpl x16, x20, [x23] + __ ldaddl(Assembler::xword, r4, r16, r10); 
// ldaddl x4, x16, [x10] + __ ldbicl(Assembler::xword, r23, r11, r25); // ldclrl x23, x11, [x25] + __ ldeorl(Assembler::xword, r6, zr, r16); // ldeorl x6, xzr, [x16] + __ ldorrl(Assembler::xword, r13, r23, r12); // ldsetl x13, x23, [x12] + __ ldsminl(Assembler::xword, r1, r14, r9); // ldsminl x1, x14, [x9] + __ ldsmaxl(Assembler::xword, r21, r16, r26); // ldsmaxl x21, x16, [x26] + __ lduminl(Assembler::xword, r15, r4, r4); // lduminl x15, x4, [x4] + __ ldumaxl(Assembler::xword, r16, r8, r6); // ldumaxl x16, x8, [x6] // LSEOp - __ swp(Assembler::word, r15, r19, r26); // swp w15, w19, [x26] - __ ldadd(Assembler::word, r9, r10, r23); // ldadd w9, w10, [x23] - __ ldbic(Assembler::word, r21, r22, r28); // ldclr w21, w22, [x28] - __ ldeor(Assembler::word, r2, r3, r15); // ldeor w2, w3, [x15] - __ ldorr(Assembler::word, r19, r20, r7); // ldset w19, w20, [x7] - __ ldsmin(Assembler::word, r4, r29, r7); // ldsmin w4, w29, [x7] - __ ldsmax(Assembler::word, r0, r9, r16); // ldsmax w0, w9, [x16] - __ ldumin(Assembler::word, r20, r23, r4); // ldumin w20, w23, [x4] - __ ldumax(Assembler::word, r16, r10, r23); // ldumax w16, w10, [x23] + __ swp(Assembler::word, r30, r4, r29); // swp w30, w4, [x29] + __ ldadd(Assembler::word, r17, r29, r26); // ldadd w17, w29, [x26] + __ ldbic(Assembler::word, r9, r15, r2); // ldclr w9, w15, [x2] + __ ldeor(Assembler::word, r11, r29, r3); // ldeor w11, w29, [x3] + __ ldorr(Assembler::word, r7, r1, r27); // ldset w7, w1, [x27] + __ ldsmin(Assembler::word, r21, r16, r14); // ldsmin w21, w16, [x14] + __ ldsmax(Assembler::word, r8, r16, r22); // ldsmax w8, w16, [x22] + __ ldumin(Assembler::word, r25, r5, r20); // ldumin w25, w5, [x20] + __ ldumax(Assembler::word, r21, r16, r23); // ldumax w21, w16, [x23] // LSEOp - __ swpa(Assembler::word, r11, r25, r6); // swpa w11, w25, [x6] - __ ldadda(Assembler::word, zr, r16, r13); // ldadda wzr, w16, [x13] - __ ldbica(Assembler::word, r23, r12, r1); // ldclra w23, w12, [x1] - __ ldeora(Assembler::word, r14, r9, 
r21); // ldeora w14, w9, [x21] - __ ldorra(Assembler::word, r16, r26, r15); // ldseta w16, w26, [x15] - __ ldsmina(Assembler::word, r4, r4, r15); // ldsmina w4, w4, [x15] - __ ldsmaxa(Assembler::word, r8, r6, r30); // ldsmaxa w8, w6, [x30] - __ ldumina(Assembler::word, r4, r29, r17); // ldumina w4, w29, [x17] - __ ldumaxa(Assembler::word, r29, r26, r9); // ldumaxa w29, w26, [x9] + __ swpa(Assembler::word, r16, r30, r20); // swpa w16, w30, [x20] + __ ldadda(Assembler::word, r20, r0, r4); // ldadda w20, w0, [x4] + __ ldbica(Assembler::word, r19, r24, r4); // ldclra w19, w24, [x4] + __ ldeora(Assembler::word, r20, r4, r24); // ldeora w20, w4, [x24] + __ ldorra(Assembler::word, r26, r19, r2); // ldseta w26, w19, [x2] + __ ldsmina(Assembler::word, r8, r8, r14); // ldsmina w8, w8, [x14] + __ ldsmaxa(Assembler::word, r24, r16, sp); // ldsmaxa w24, w16, [sp] + __ ldumina(Assembler::word, r22, r4, sp); // ldumina w22, w4, [sp] + __ ldumaxa(Assembler::word, r1, r10, r20); // ldumaxa w1, w10, [x20] // LSEOp - __ swpal(Assembler::word, r15, r2, r11); // swpal w15, w2, [x11] - __ ldaddal(Assembler::word, r29, r3, r7); // ldaddal w29, w3, [x7] - __ ldbical(Assembler::word, r1, r27, r21); // ldclral w1, w27, [x21] - __ ldeoral(Assembler::word, r16, r14, r8); // ldeoral w16, w14, [x8] - __ ldorral(Assembler::word, r16, r22, r25); // ldsetal w16, w22, [x25] - __ ldsminal(Assembler::word, r5, r20, r21); // ldsminal w5, w20, [x21] - __ ldsmaxal(Assembler::word, r16, r23, r16); // ldsmaxal w16, w23, [x16] - __ lduminal(Assembler::word, r30, r20, r20); // lduminal w30, w20, [x20] - __ ldumaxal(Assembler::word, r0, r4, r19); // ldumaxal w0, w4, [x19] + __ swpal(Assembler::word, r12, r0, r9); // swpal w12, w0, [x9] + __ ldaddal(Assembler::word, r7, r24, r15); // ldaddal w7, w24, [x15] + __ ldbical(Assembler::word, r4, r27, r6); // ldclral w4, w27, [x6] + __ ldeoral(Assembler::word, r10, r27, r24); // ldeoral w10, w27, [x24] + __ ldorral(Assembler::word, r13, r16, sp); // ldsetal w13, 
w16, [sp] + __ ldsminal(Assembler::word, r22, r22, r20); // ldsminal w22, w22, [x20] + __ ldsmaxal(Assembler::word, zr, r29, r9); // ldsmaxal wzr, w29, [x9] + __ lduminal(Assembler::word, r14, r20, r7); // lduminal w14, w20, [x7] + __ ldumaxal(Assembler::word, r20, r28, r9); // ldumaxal w20, w28, [x9] // LSEOp - __ swpl(Assembler::word, r24, r4, r20); // swpl w24, w4, [x20] - __ ldaddl(Assembler::word, r4, r24, r26); // ldaddl w4, w24, [x26] - __ ldbicl(Assembler::word, r19, r2, r8); // ldclrl w19, w2, [x8] - __ ldeorl(Assembler::word, r8, r14, r24); // ldeorl w8, w14, [x24] - __ ldorrl(Assembler::word, r16, zr, r22); // ldsetl w16, wzr, [x22] - __ ldsminl(Assembler::word, r4, zr, r1); // ldsminl w4, wzr, [x1] - __ ldsmaxl(Assembler::word, r10, r20, r12); // ldsmaxl w10, w20, [x12] - __ lduminl(Assembler::word, r0, r9, r7); // lduminl w0, w9, [x7] - __ ldumaxl(Assembler::word, r24, r16, r4); // ldumaxl w24, w16, [x4] + __ swpl(Assembler::word, r11, r14, r12); // swpl w11, w14, [x12] + __ ldaddl(Assembler::word, r20, r1, r24); // ldaddl w20, w1, [x24] + __ ldbicl(Assembler::word, r9, r19, r13); // ldclrl w9, w19, [x13] + __ ldeorl(Assembler::word, r19, r16, r16); // ldeorl w19, w16, [x16] + __ ldorrl(Assembler::word, r5, r0, r3); // ldsetl w5, w0, [x3] + __ ldsminl(Assembler::word, r12, r8, r15); // ldsminl w12, w8, [x15] + __ ldsmaxl(Assembler::word, r15, r16, r4); // ldsmaxl w15, w16, [x4] + __ lduminl(Assembler::word, r15, r30, r5); // lduminl w15, w30, [x5] + __ ldumaxl(Assembler::word, r0, r10, r22); // ldumaxl w0, w10, [x22] // SHA3SIMDOp - __ bcax(v27, __ T16B, v6, v10, v27); // bcax v27.16B, v6.16B, v10.16B, v27.16B - __ eor3(v24, __ T16B, v13, v16, v31); // eor3 v24.16B, v13.16B, v16.16B, v31.16B - __ rax1(v22, __ T2D, v22, v20); // rax1 v22.2D, v22.2D, v20.2D - __ xar(v31, __ T2D, v29, v9, 28); // xar v31.2D, v29.2D, v9.2D, #28 + __ bcax(v27, __ T16B, v3, v0, v9); // bcax v27.16B, v3.16B, v0.16B, v9.16B + __ eor3(v19, __ T16B, v29, v10, v24); // eor3 
v19.16B, v29.16B, v10.16B, v24.16B + __ rax1(v4, __ T2D, v20, v7); // rax1 v4.2D, v20.2D, v7.2D + __ xar(v24, __ T2D, v29, v14, 43); // xar v24.2D, v29.2D, v14.2D, #43 // SHA512SIMDOp - __ sha512h(v20, __ T2D, v7, v20); // sha512h q20, q7, v20.2D - __ sha512h2(v28, __ T2D, v9, v11); // sha512h2 q28, q9, v11.2D - __ sha512su0(v14, __ T2D, v12); // sha512su0 v14.2D, v12.2D - __ sha512su1(v20, __ T2D, v1, v24); // sha512su1 v20.2D, v1.2D, v24.2D + __ sha512h(v11, __ T2D, v27, v13); // sha512h q11, q27, v13.2D + __ sha512h2(v18, __ T2D, v31, v17); // sha512h2 q18, q31, v17.2D + __ sha512su0(v14, __ T2D, v3); // sha512su0 v14.2D, v3.2D + __ sha512su1(v30, __ T2D, v16, v22); // sha512su1 v30.2D, v16.2D, v22.2D // SVEBinaryImmOp - __ sve_add(z9, __ S, 108u); // add z9.s, z9.s, #0x6c - __ sve_sub(z19, __ S, 132u); // sub z19.s, z19.s, #0x84 - __ sve_and(z5, __ B, 124u); // and z5.b, z5.b, #0x7c - __ sve_eor(z8, __ H, 32768u); // eor z8.h, z8.h, #0x8000 - __ sve_orr(z4, __ H, 508u); // orr z4.h, z4.h, #0x1fc + __ sve_add(z20, __ B, 163u); // add z20.b, z20.b, #0xa3 + __ sve_sub(z3, __ B, 215u); // sub z3.b, z3.b, #0xd7 + __ sve_and(z19, __ H, 33279u); // and z19.h, z19.h, #0x81ff + __ sve_eor(z21, __ B, 12u); // eor z21.b, z21.b, #0xc + __ sve_orr(z24, __ H, 8064u); // orr z24.h, z24.h, #0x1f80 // SVEBinaryImmOp - __ sve_add(z0, __ H, 181u); // add z0.h, z0.h, #0xb5 - __ sve_sub(z27, __ B, 7u); // sub z27.b, z27.b, #0x7 - __ sve_and(z9, __ S, 130023424u); // and z9.s, z9.s, #0x7c00000 - __ sve_eor(z24, __ B, 62u); // eor z24.b, z24.b, #0x3e - __ sve_orr(z24, __ D, 18428729675200069887u); // orr z24.d, z24.d, #0xffc00000000000ff + __ sve_add(z21, __ H, 139u); // add z21.h, z21.h, #0x8b + __ sve_sub(z30, __ H, 26u); // sub z30.h, z30.h, #0x1a + __ sve_and(z3, __ S, 122880u); // and z3.s, z3.s, #0x1e000 + __ sve_eor(z24, __ D, 18158513714670600195u); // eor z24.d, z24.d, #0xfc000003fc000003 + __ sve_orr(z23, __ B, 191u); // orr z23.b, z23.b, #0xbf // SVEBinaryImmOp - __ 
sve_add(z11, __ D, 104u); // add z11.d, z11.d, #0x68 - __ sve_sub(z18, __ D, 142u); // sub z18.d, z18.d, #0x8e - __ sve_and(z14, __ B, 131u); // and z14.b, z14.b, #0x83 - __ sve_eor(z22, __ S, 4042322160u); // eor z22.s, z22.s, #0xf0f0f0f0 - __ sve_orr(z3, __ B, 225u); // orr z3.b, z3.b, #0xe1 + __ sve_add(z14, __ B, 66u); // add z14.b, z14.b, #0x42 + __ sve_sub(z26, __ B, 180u); // sub z26.b, z26.b, #0xb4 + __ sve_and(z18, __ S, 253952u); // and z18.s, z18.s, #0x3e000 + __ sve_eor(z9, __ S, 16744448u); // eor z9.s, z9.s, #0xff8000 + __ sve_orr(z12, __ H, 33279u); // orr z12.h, z12.h, #0x81ff // SVEBinaryImmOp - __ sve_add(z9, __ S, 142u); // add z9.s, z9.s, #0x8e - __ sve_sub(z21, __ B, 36u); // sub z21.b, z21.b, #0x24 - __ sve_and(z2, __ D, 8796093020160u); // and z2.d, z2.d, #0x7fffffff800 - __ sve_eor(z11, __ S, 3221229567u); // eor z11.s, z11.s, #0xc0000fff - __ sve_orr(z30, __ H, 126u); // orr z30.h, z30.h, #0x7e + __ sve_add(z11, __ H, 206u); // add z11.h, z11.h, #0xce + __ sve_sub(z18, __ D, 154u); // sub z18.d, z18.d, #0x9a + __ sve_and(z9, __ S, 4294459391u); // and z9.s, z9.s, #0xfff83fff + __ sve_eor(z23, __ D, 562675075514368u); // eor z23.d, z23.d, #0x1ffc000000000 + __ sve_orr(z8, __ B, 243u); // orr z8.b, z8.b, #0xf3 // SVEBinaryImmOp - __ sve_add(z23, __ H, 29u); // add z23.h, z23.h, #0x1d - __ sve_sub(z24, __ D, 26u); // sub z24.d, z24.d, #0x1a - __ sve_and(z19, __ S, 4294049777u); // and z19.s, z19.s, #0xfff1fff1 - __ sve_eor(z21, __ H, 1008u); // eor z21.h, z21.h, #0x3f0 - __ sve_orr(z26, __ B, 131u); // orr z26.b, z26.b, #0x83 + __ sve_add(z10, __ B, 121u); // add z10.b, z10.b, #0x79 + __ sve_sub(z25, __ S, 172u); // sub z25.s, z25.s, #0xac + __ sve_and(z0, __ B, 239u); // and z0.b, z0.b, #0xef + __ sve_eor(z5, __ D, 17870287719452639231u); // eor z5.d, z5.d, #0xf80003ffffffffff + __ sve_orr(z17, __ B, 128u); // orr z17.b, z17.b, #0x80 // SVEBinaryImmOp - __ sve_add(z17, __ B, 31u); // add z17.b, z17.b, #0x1f - __ sve_sub(z9, __ S, 97u); // sub 
z9.s, z9.s, #0x61 - __ sve_and(z8, __ H, 49155u); // and z8.h, z8.h, #0xc003 - __ sve_eor(z17, __ H, 57855u); // eor z17.h, z17.h, #0xe1ff - __ sve_orr(z18, __ D, 2251799811588096u); // orr z18.d, z18.d, #0x7ffffffe00000 + __ sve_add(z30, __ H, 3u); // add z30.h, z30.h, #0x3 + __ sve_sub(z18, __ B, 253u); // sub z18.b, z18.b, #0xfd + __ sve_and(z21, __ S, 4294965263u); // and z21.s, z21.s, #0xfffff80f + __ sve_eor(z12, __ H, 1u); // eor z12.h, z12.h, #0x1 + __ sve_orr(z15, __ S, 1u); // orr z15.s, z15.s, #0x1 // SVEVectorOp - __ sve_add(z16, __ S, z15, z27); // add z16.s, z15.s, z27.s - __ sve_sub(z28, __ H, z22, z8); // sub z28.h, z22.h, z8.h - __ sve_fadd(z5, __ S, z28, z28); // fadd z5.s, z28.s, z28.s - __ sve_fmul(z0, __ D, z15, z25); // fmul z0.d, z15.d, z25.d - __ sve_fsub(z21, __ D, z0, z3); // fsub z21.d, z0.d, z3.d - __ sve_sqadd(z26, __ D, z5, z26); // sqadd z26.d, z5.d, z26.d - __ sve_sqsub(z19, __ H, z17, z1); // sqsub z19.h, z17.h, z1.h - __ sve_uqadd(z14, __ B, z30, z14); // uqadd z14.b, z30.b, z14.b - __ sve_uqsub(z18, __ S, z2, z31); // uqsub z18.s, z2.s, z31.s - __ sve_abs(z23, __ H, p5, z30); // abs z23.h, p5/m, z30.h - __ sve_add(z8, __ H, p0, z0); // add z8.h, p0/m, z8.h, z0.h - __ sve_and(z23, __ S, p5, z0); // and z23.s, p5/m, z23.s, z0.s - __ sve_asr(z26, __ H, p6, z24); // asr z26.h, p6/m, z26.h, z24.h - __ sve_bic(z22, __ B, p5, z2); // bic z22.b, p5/m, z22.b, z2.b - __ sve_clz(z11, __ S, p5, z12); // clz z11.s, p5/m, z12.s - __ sve_cnt(z24, __ D, p6, z9); // cnt z24.d, p6/m, z9.d - __ sve_eor(z17, __ S, p5, z20); // eor z17.s, p5/m, z17.s, z20.s - __ sve_lsl(z4, __ D, p5, z13); // lsl z4.d, p5/m, z4.d, z13.d - __ sve_lsr(z22, __ D, p7, z31); // lsr z22.d, p7/m, z22.d, z31.d - __ sve_mul(z18, __ H, p4, z15); // mul z18.h, p4/m, z18.h, z15.h - __ sve_neg(z13, __ B, p7, z20); // neg z13.b, p7/m, z20.b - __ sve_not(z1, __ B, p3, z14); // not z1.b, p3/m, z14.b - __ sve_orr(z7, __ S, p2, z12); // orr z7.s, p2/m, z7.s, z12.s - __ sve_rbit(z4, __ 
B, p6, z15); // rbit z4.b, p6/m, z15.b - __ sve_revb(z3, __ S, p7, z1); // revb z3.s, p7/m, z1.s - __ sve_smax(z5, __ D, p5, z31); // smax z5.d, p5/m, z5.d, z31.d - __ sve_smin(z13, __ H, p3, z9); // smin z13.h, p3/m, z13.h, z9.h - __ sve_umax(z30, __ D, p0, z15); // umax z30.d, p0/m, z30.d, z15.d - __ sve_umin(z3, __ H, p0, z26); // umin z3.h, p0/m, z3.h, z26.h - __ sve_sub(z25, __ D, p2, z1); // sub z25.d, p2/m, z25.d, z1.d - __ sve_fabs(z10, __ D, p3, z1); // fabs z10.d, p3/m, z1.d - __ sve_fadd(z26, __ D, p1, z29); // fadd z26.d, p1/m, z26.d, z29.d - __ sve_fdiv(z17, __ S, p1, z28); // fdiv z17.s, p1/m, z17.s, z28.s - __ sve_fmax(z1, __ S, p7, z11); // fmax z1.s, p7/m, z1.s, z11.s - __ sve_fmin(z1, __ D, p0, z1); // fmin z1.d, p0/m, z1.d, z1.d - __ sve_fmul(z27, __ S, p3, z2); // fmul z27.s, p3/m, z27.s, z2.s - __ sve_fneg(z30, __ S, p4, z25); // fneg z30.s, p4/m, z25.s - __ sve_frintm(z2, __ D, p6, z3); // frintm z2.d, p6/m, z3.d - __ sve_frintn(z29, __ D, p3, z3); // frintn z29.d, p3/m, z3.d - __ sve_frintp(z14, __ D, p4, z28); // frintp z14.d, p4/m, z28.d - __ sve_fsqrt(z4, __ D, p2, z27); // fsqrt z4.d, p2/m, z27.d - __ sve_fsub(z2, __ D, p4, z1); // fsub z2.d, p4/m, z2.d, z1.d - __ sve_fmad(z7, __ D, p5, z31, z28); // fmad z7.d, p5/m, z31.d, z28.d - __ sve_fmla(z10, __ S, p5, z17, z29); // fmla z10.s, p5/m, z17.s, z29.s - __ sve_fmls(z22, __ S, p1, z12, z24); // fmls z22.s, p1/m, z12.s, z24.s - __ sve_fmsb(z9, __ S, p2, z11, z0); // fmsb z9.s, p2/m, z11.s, z0.s - __ sve_fnmad(z23, __ S, p5, z20, z4); // fnmad z23.s, p5/m, z20.s, z4.s - __ sve_fnmsb(z15, __ D, p3, z4, z30); // fnmsb z15.d, p3/m, z4.d, z30.d - __ sve_fnmla(z27, __ S, p1, z21, z26); // fnmla z27.s, p1/m, z21.s, z26.s - __ sve_fnmls(z31, __ S, p0, z25, z4); // fnmls z31.s, p0/m, z25.s, z4.s - __ sve_mla(z6, __ D, p0, z21, z7); // mla z6.d, p0/m, z21.d, z7.d - __ sve_mls(z24, __ S, p7, z24, z31); // mls z24.s, p7/m, z24.s, z31.s - __ sve_and(z1, z10, z12); // and z1.d, z10.d, z12.d - __ 
sve_eor(z13, z8, z25); // eor z13.d, z8.d, z25.d - __ sve_orr(z1, z31, z23); // orr z1.d, z31.d, z23.d - __ sve_bic(z20, z0, z21); // bic z20.d, z0.d, z21.d - __ sve_uzp1(z31, __ S, z29, z27); // uzp1 z31.s, z29.s, z27.s - __ sve_uzp2(z8, __ S, z29, z26); // uzp2 z8.s, z29.s, z26.s - __ sve_fabd(z5, __ D, p1, z18); // fabd z5.d, p1/m, z5.d, z18.d - __ sve_bext(z13, __ H, z26, z21); // bext z13.h, z26.h, z21.h - __ sve_bdep(z0, __ D, z19, z10); // bdep z0.d, z19.d, z10.d - __ sve_eor3(z7, z17, z6); // eor3 z7.d, z7.d, z17.d, z6.d - __ sve_sqadd(z20, __ H, p6, z28); // sqadd z20.h, p6/m, z20.h, z28.h - __ sve_sqsub(z17, __ H, p3, z19); // sqsub z17.h, p3/m, z17.h, z19.h - __ sve_uqadd(z26, __ B, p2, z24); // uqadd z26.b, p2/m, z26.b, z24.b - __ sve_uqsub(z11, __ S, p3, z28); // uqsub z11.s, p3/m, z11.s, z28.s + __ sve_add(z19, __ D, z26, z27); // add z19.d, z26.d, z27.d + __ sve_sub(z13, __ B, z22, z22); // sub z13.b, z22.b, z22.b + __ sve_fadd(z1, __ S, z11, z20); // fadd z1.s, z11.s, z20.s + __ sve_fmul(z20, __ S, z24, z24); // fmul z20.s, z24.s, z24.s + __ sve_fsub(z31, __ D, z17, z20); // fsub z31.d, z17.d, z20.d + __ sve_sqadd(z21, __ H, z4, z21); // sqadd z21.h, z4.h, z21.h + __ sve_sqsub(z30, __ D, z22, z31); // sqsub z30.d, z22.d, z31.d + __ sve_uqadd(z26, __ H, z18, z19); // uqadd z26.h, z18.h, z19.h + __ sve_uqsub(z11, __ S, z13, z29); // uqsub z11.s, z13.s, z29.s + __ sve_abs(z5, __ H, p0, z14); // abs z5.h, p0/m, z14.h + __ sve_add(z2, __ H, p1, z10); // add z2.h, p1/m, z2.h, z10.h + __ sve_and(z19, __ H, p1, z26); // and z19.h, p1/m, z19.h, z26.h + __ sve_asr(z2, __ B, p0, z30); // asr z2.b, p0/m, z2.b, z30.b + __ sve_bic(z20, __ D, p1, z20); // bic z20.d, p1/m, z20.d, z20.d + __ sve_clz(z29, __ H, p3, z13); // clz z29.h, p3/m, z13.h + __ sve_cnt(z14, __ H, p7, z1); // cnt z14.h, p7/m, z1.h + __ sve_eor(z28, __ D, p0, z3); // eor z28.d, p0/m, z28.d, z3.d + __ sve_lsl(z9, __ B, p6, z9); // lsl z9.b, p6/m, z9.b, z9.b + __ sve_lsr(z26, __ B, p2, z14); // 
lsr z26.b, p2/m, z26.b, z14.b + __ sve_mul(z20, __ D, p6, z7); // mul z20.d, p6/m, z20.d, z7.d + __ sve_neg(z20, __ D, p4, z6); // neg z20.d, p4/m, z6.d + __ sve_not(z13, __ H, p0, z29); // not z13.h, p0/m, z29.h + __ sve_orr(z9, __ B, p0, z1); // orr z9.b, p0/m, z9.b, z1.b + __ sve_rbit(z27, __ B, p6, z15); // rbit z27.b, p6/m, z15.b + __ sve_revb(z4, __ D, p7, z17); // revb z4.d, p7/m, z17.d + __ sve_smax(z2, __ B, p0, z24); // smax z2.b, p0/m, z2.b, z24.b + __ sve_smin(z26, __ B, p7, z13); // smin z26.b, p7/m, z26.b, z13.b + __ sve_umax(z22, __ D, p3, z16); // umax z22.d, p3/m, z22.d, z16.d + __ sve_umin(z17, __ D, p1, z11); // umin z17.d, p1/m, z17.d, z11.d + __ sve_sub(z16, __ B, p0, z16); // sub z16.b, p0/m, z16.b, z16.b + __ sve_fabs(z28, __ D, p1, z23); // fabs z28.d, p1/m, z23.d + __ sve_fadd(z28, __ D, p4, z10); // fadd z28.d, p4/m, z28.d, z10.d + __ sve_fdiv(z17, __ D, p7, z7); // fdiv z17.d, p7/m, z17.d, z7.d + __ sve_fmax(z4, __ S, p3, z24); // fmax z4.s, p3/m, z4.s, z24.s + __ sve_fmin(z9, __ S, p2, z11); // fmin z9.s, p2/m, z9.s, z11.s + __ sve_fmul(z4, __ D, p5, z22); // fmul z4.d, p5/m, z4.d, z22.d + __ sve_fneg(z4, __ S, p0, z15); // fneg z4.s, p0/m, z15.s + __ sve_frintm(z4, __ D, p7, z26); // frintm z4.d, p7/m, z26.d + __ sve_frintn(z5, __ S, p5, z26); // frintn z5.s, p5/m, z26.s + __ sve_frintp(z31, __ S, p0, z25); // frintp z31.s, p0/m, z25.s + __ sve_fsqrt(z8, __ D, p1, z3); // fsqrt z8.d, p1/m, z3.d + __ sve_fsub(z7, __ D, p6, z24); // fsub z7.d, p6/m, z7.d, z24.d + __ sve_fmad(z24, __ S, p7, z17, z1); // fmad z24.s, p7/m, z17.s, z1.s + __ sve_fmla(z12, __ D, p7, z13, z8); // fmla z12.d, p7/m, z13.d, z8.d + __ sve_fmls(z29, __ D, p0, z31, z23); // fmls z29.d, p0/m, z31.d, z23.d + __ sve_fmsb(z20, __ D, p0, z21, z7); // fmsb z20.d, p0/m, z21.d, z7.d + __ sve_fnmad(z29, __ D, p6, z22, z8); // fnmad z29.d, p6/m, z22.d, z8.d + __ sve_fnmsb(z26, __ D, p5, z5, z6); // fnmsb z26.d, p5/m, z5.d, z6.d + __ sve_fnmla(z18, __ S, p3, z26, z21); // fnmla 
z18.s, p3/m, z26.s, z21.s + __ sve_fnmls(z0, __ S, p4, z10, z28); // fnmls z0.s, p4/m, z10.s, z28.s + __ sve_mla(z17, __ D, p1, z30, z20); // mla z17.d, p1/m, z30.d, z20.d + __ sve_mls(z28, __ S, p3, z17, z14); // mls z28.s, p3/m, z17.s, z14.s + __ sve_and(z10, z26, z11); // and z10.d, z26.d, z11.d + __ sve_eor(z0, z11, z15); // eor z0.d, z11.d, z15.d + __ sve_orr(z23, z23, z20); // orr z23.d, z23.d, z20.d + __ sve_bic(z23, z20, z29); // bic z23.d, z20.d, z29.d + __ sve_uzp1(z0, __ S, z27, z6); // uzp1 z0.s, z27.s, z6.s + __ sve_uzp2(z13, __ H, z12, z4); // uzp2 z13.h, z12.h, z4.h + __ sve_fabd(z31, __ D, p6, z23); // fabd z31.d, p6/m, z31.d, z23.d + __ sve_bext(z6, __ D, z2, z29); // bext z6.d, z2.d, z29.d + __ sve_bdep(z0, __ B, z29, z23); // bdep z0.b, z29.b, z23.b + __ sve_eor3(z4, z5, z8); // eor3 z4.d, z4.d, z5.d, z8.d + __ sve_sqadd(z13, __ H, p4, z13); // sqadd z13.h, p4/m, z13.h, z13.h + __ sve_sqsub(z8, __ H, p2, z8); // sqsub z8.h, p2/m, z8.h, z8.h + __ sve_uqadd(z19, __ S, p0, z29); // uqadd z19.s, p0/m, z19.s, z29.s + __ sve_uqsub(z16, __ D, p3, z23); // uqsub z16.d, p3/m, z16.d, z23.d // SVEReductionOp - __ sve_andv(v23, __ S, p5, z28); // andv s23, p5, z28.s - __ sve_orv(v20, __ B, p7, z24); // orv b20, p7, z24.b - __ sve_eorv(v27, __ H, p1, z23); // eorv h27, p1, z23.h - __ sve_smaxv(v12, __ D, p1, z13); // smaxv d12, p1, z13.d - __ sve_sminv(v26, __ B, p5, z20); // sminv b26, p5, z20.b - __ sve_fminv(v2, __ S, p7, z29); // fminv s2, p7, z29.s - __ sve_fmaxv(v29, __ S, p5, z3); // fmaxv s29, p5, z3.s - __ sve_fadda(v5, __ S, p2, z28); // fadda s5, p2, s5, z28.s - __ sve_uaddv(v17, __ H, p3, z14); // uaddv d17, p3, z14.h + __ sve_andv(v23, __ B, p7, z13); // andv b23, p7, z13.b + __ sve_orv(v25, __ H, p5, z0); // orv h25, p5, z0.h + __ sve_eorv(v25, __ H, p7, z11); // eorv h25, p7, z11.h + __ sve_smaxv(v14, __ H, p5, z22); // smaxv h14, p5, z22.h + __ sve_sminv(v5, __ H, p4, z0); // sminv h5, p4, z0.h + __ sve_fminv(v9, __ D, p0, z3); // fminv d9, 
p0, z3.d + __ sve_fmaxv(v14, __ S, p1, z29); // fmaxv s14, p1, z29.s + __ sve_fadda(v14, __ D, p5, z4); // fadda d14, p5, d14, z4.d + __ sve_uaddv(v27, __ S, p3, z22); // uaddv d27, p3, z22.s // AddWideNEONOp - __ saddwv(v10, v11, __ T8H, v12, __ T8B); // saddw v10.8H, v11.8H, v12.8B - __ saddwv2(v8, v9, __ T8H, v10, __ T16B); // saddw2 v8.8H, v9.8H, v10.16B - __ saddwv(v9, v10, __ T4S, v11, __ T4H); // saddw v9.4S, v10.4S, v11.4H - __ saddwv2(v19, v20, __ T4S, v21, __ T8H); // saddw2 v19.4S, v20.4S, v21.8H - __ saddwv(v0, v1, __ T2D, v2, __ T2S); // saddw v0.2D, v1.2D, v2.2S - __ saddwv2(v29, v30, __ T2D, v31, __ T4S); // saddw2 v29.2D, v30.2D, v31.4S - __ uaddwv(v16, v17, __ T8H, v18, __ T8B); // uaddw v16.8H, v17.8H, v18.8B - __ uaddwv2(v16, v17, __ T8H, v18, __ T16B); // uaddw2 v16.8H, v17.8H, v18.16B - __ uaddwv(v13, v14, __ T4S, v15, __ T4H); // uaddw v13.4S, v14.4S, v15.4H - __ uaddwv2(v23, v24, __ T4S, v25, __ T8H); // uaddw2 v23.4S, v24.4S, v25.8H - __ uaddwv(v24, v25, __ T2D, v26, __ T2S); // uaddw v24.2D, v25.2D, v26.2S - __ uaddwv2(v23, v24, __ T2D, v25, __ T4S); // uaddw2 v23.2D, v24.2D, v25.4S + __ saddwv(v31, v0, __ T8H, v1, __ T8B); // saddw v31.8H, v0.8H, v1.8B + __ saddwv2(v24, v25, __ T8H, v26, __ T16B); // saddw2 v24.8H, v25.8H, v26.16B + __ saddwv(v11, v12, __ T4S, v13, __ T4H); // saddw v11.4S, v12.4S, v13.4H + __ saddwv2(v16, v17, __ T4S, v18, __ T8H); // saddw2 v16.4S, v17.4S, v18.8H + __ saddwv(v12, v13, __ T2D, v14, __ T2S); // saddw v12.2D, v13.2D, v14.2S + __ saddwv2(v17, v18, __ T2D, v19, __ T4S); // saddw2 v17.2D, v18.2D, v19.4S + __ uaddwv(v28, v29, __ T8H, v30, __ T8B); // uaddw v28.8H, v29.8H, v30.8B + __ uaddwv2(v3, v4, __ T8H, v5, __ T16B); // uaddw2 v3.8H, v4.8H, v5.16B + __ uaddwv(v28, v29, __ T4S, v30, __ T4H); // uaddw v28.4S, v29.4S, v30.4H + __ uaddwv2(v16, v17, __ T4S, v18, __ T8H); // uaddw2 v16.4S, v17.4S, v18.8H + __ uaddwv(v4, v5, __ T2D, v6, __ T2S); // uaddw v4.2D, v5.2D, v6.2S + __ uaddwv2(v29, v30, __ T2D, v31, __ 
T4S); // uaddw2 v29.2D, v30.2D, v31.4S __ bind(forth); @@ -1387,30 +1403,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x1400047d, 0x94000000, - 0x97ffffd4, 0x9400047a, 0x3400000a, 0x34fffa2a, - 0x34008eea, 0x35000008, 0x35fff9c8, 0x35008e88, - 0xb400000b, 0xb4fff96b, 0xb4008e2b, 0xb500001d, - 0xb5fff91d, 0xb5008ddd, 0x10000013, 0x10fff8b3, - 0x10008d73, 0x90000013, 0x36300016, 0x3637f836, - 0x36308cf6, 0x3758000c, 0x375ff7cc, 0x37588c8c, + 0x14000000, 0x17ffffd7, 0x1400048d, 0x94000000, + 0x97ffffd4, 0x9400048a, 0x3400000a, 0x34fffa2a, + 0x340090ea, 0x35000008, 0x35fff9c8, 0x35009088, + 0xb400000b, 0xb4fff96b, 0xb400902b, 0xb500001d, + 0xb5fff91d, 0xb5008fdd, 0x10000013, 0x10fff8b3, + 0x10008f73, 0x90000013, 0x36300016, 0x3637f836, + 0x36308ef6, 0x3758000c, 0x375ff7cc, 0x37588e8c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x54008a60, 0x54000001, 0x54fff541, 0x54008a01, - 0x54000002, 0x54fff4e2, 0x540089a2, 0x54000002, - 0x54fff482, 0x54008942, 0x54000003, 0x54fff423, - 0x540088e3, 0x54000003, 0x54fff3c3, 0x54008883, - 0x54000004, 0x54fff364, 0x54008824, 0x54000005, - 0x54fff305, 0x540087c5, 0x54000006, 0x54fff2a6, - 0x54008766, 0x54000007, 0x54fff247, 0x54008707, - 0x54000008, 0x54fff1e8, 0x540086a8, 0x54000009, - 0x54fff189, 0x54008649, 0x5400000a, 0x54fff12a, - 0x540085ea, 0x5400000b, 0x54fff0cb, 0x5400858b, - 0x5400000c, 0x54fff06c, 0x5400852c, 0x5400000d, - 0x54fff00d, 0x540084cd, 0x5400000e, 0x54ffefae, - 0x5400846e, 0x5400000f, 0x54ffef4f, 0x5400840f, + 0x54008c60, 0x54000001, 0x54fff541, 0x54008c01, + 0x54000002, 0x54fff4e2, 0x54008ba2, 0x54000002, + 0x54fff482, 0x54008b42, 0x54000003, 0x54fff423, + 0x54008ae3, 0x54000003, 0x54fff3c3, 0x54008a83, + 0x54000004, 0x54fff364, 
0x54008a24, 0x54000005, + 0x54fff305, 0x540089c5, 0x54000006, 0x54fff2a6, + 0x54008966, 0x54000007, 0x54fff247, 0x54008907, + 0x54000008, 0x54fff1e8, 0x540088a8, 0x54000009, + 0x54fff189, 0x54008849, 0x5400000a, 0x54fff12a, + 0x540087ea, 0x5400000b, 0x54fff0cb, 0x5400878b, + 0x5400000c, 0x54fff06c, 0x5400872c, 0x5400000d, + 0x54fff00d, 0x540086cd, 0x5400000e, 0x54ffefae, + 0x5400866e, 0x5400000f, 0x54ffef4f, 0x5400860f, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1472,98 +1488,102 @@ 0x9ad521f7, 0x9adb263c, 0x9ac0286a, 0x9ac92f27, 0x9bdd7de6, 0x9b427d4f, 0x1b0b2cf1, 0x1b1ddcf7, 0x9b0b2f6e, 0x9b0cbf04, 0x9b2b728e, 0x9b2cdd6d, - 0x9bae275e, 0x9ba7954d, 0x7ea3d5fe, 0x1e30098c, - 0x1e321bff, 0x1e302ab3, 0x1e35394f, 0x7efcd542, - 0x1e7f0bc7, 0x1e621832, 0x1e632946, 0x1e673979, - 0x1f000d81, 0x1f06dfb3, 0x1f3c6c06, 0x1f2774a2, - 0x1f4d332c, 0x1f48ca78, 0x1f755356, 0x1f7e5853, - 0x1e2042c8, 0x1e20c2b3, 0x1e21424c, 0x1e21c0d5, - 0x1e22c070, 0x1e23c3a3, 0x1ee24383, 0x1e6041cf, - 0x1e60c1aa, 0x1e61424c, 0x1e61c34a, 0x1e6240e7, - 0x1e3803ae, 0x9e3802e0, 0x1e780180, 0x9e7801d7, - 0x1e2200ed, 0x9e2200ef, 0x1e620289, 0x9e620393, - 0x1e24021e, 0x9e640122, 0x1e3002b0, 0x9e70009d, - 0x1e260361, 0x9e660318, 0x1e2702ae, 0x9e6700ad, - 0x1e392180, 0x1e7e2320, 0x1e202388, 0x1e6022a8, - 0x293a1796, 0x29426e73, 0x697c68fc, 0xa93d0486, - 0xa97b5eba, 0x29b47934, 0x29c2534d, 0x69f62dbd, - 0xa9bd54bb, 0xa9c503c6, 0x28a63e13, 0x28e25d2c, - 0x68c469e0, 0xa8b34748, 0xa8f51c59, 0x28264433, - 0x285036c0, 0xa8005f7d, 0xa872290b, 0x0c407160, - 0x4cdfa350, 0x0cd16f56, 0x4cdf27bb, 0x0d40c0d6, - 0x4ddfcbae, 0x0dd0cd96, 0x4c408c01, 0x0cdf86aa, - 0x4d60c327, 0x0dffc929, 0x4deecd89, 0x4cd14887, - 0x0c404a37, 0x4d40e6c4, 0x4ddfe84d, 0x0dcced50, - 0x4cdf0444, 0x0ccb0286, 0x0d60e18c, 0x0dffe630, - 0x0df0eb2e, 0x0e31bab4, 0x4e31b841, 0x0e71bb17, - 0x4e71bbfe, 0x4eb1b9ee, 0x0e30a862, 0x4e30a8e6, - 
0x0e70a883, 0x4e70a928, 0x4eb0ab59, 0x6e30f820, - 0x0e31ab9b, 0x2e31abfe, 0x4e31a8c5, 0x6e31a8c5, - 0x0e71abfe, 0x2e71a98b, 0x4e71ab59, 0x6e71a820, - 0x4eb1a81f, 0x6eb1a820, 0x6eb0fa93, 0x7e30fbdd, - 0x7e70fb7a, 0x7eb0f949, 0x7ef0fb7a, 0x0ea0c9ac, - 0x4ea0ca0f, 0x4ee0c98b, 0x2ea0c98b, 0x6ea0ca72, - 0x6ee0cb59, 0x0ea0daf6, 0x4ea0db38, 0x4ee0d820, - 0x0ea0ea51, 0x4ea0e98b, 0x4ee0e8e6, 0x2ea0dbdd, - 0x6ea0d8e6, 0x6ee0d8c5, 0x0e20b8c5, 0x4e20bad5, - 0x0e60ba93, 0x4e60ba30, 0x0ea0ba72, 0x4ea0bbfe, - 0x4ee0bb9b, 0x0ea0fbbc, 0x4ea0f841, 0x4ee0fbbc, - 0x2ea0f841, 0x6ea0fab4, 0x6ee0fbdd, 0x2ea1fa30, - 0x6ea1f9cd, 0x6ee1f96a, 0x2e205bdd, 0x6e205bdd, - 0x0e351e93, 0x4e381ef6, 0x0eac1d6a, 0x4ea61ca4, - 0x2e211c1f, 0x6e371ed5, 0x0e2a8528, 0x4e21841f, - 0x0e758693, 0x4e6c856a, 0x0ebe87bc, 0x4ea48462, - 0x4efb8759, 0x0e270cc5, 0x4e250c83, 0x0e6a0d28, - 0x4e780ef6, 0x0eb50e93, 0x4eaf0dcd, 0x4ee70cc5, - 0x2e3f0fdd, 0x6e3a0f38, 0x2e770ed5, 0x6e7c0f7a, - 0x2eba0f38, 0x6ea50c83, 0x6efa0f38, 0x0e3cd77a, - 0x4e39d717, 0x4e71d60f, 0x2e3786d5, 0x6e258483, - 0x2e7a8738, 0x6e6a8528, 0x2ebb8759, 0x6eb686b4, - 0x6ef28630, 0x0e332e51, 0x4e242c62, 0x0e632c41, - 0x4e622c20, 0x0eba2f38, 0x4ea62ca4, 0x4ee52c83, - 0x2e2e2dac, 0x6e212c1f, 0x2e7e2fbc, 0x6e6c2d6a, - 0x2ebc2f7a, 0x6ea42c62, 0x6eee2dac, 0x0eb4d672, - 0x4ea1d41f, 0x4ee3d441, 0x0e2f9dcd, 0x4e3f9fdd, - 0x0e629c20, 0x4e759e93, 0x0eae9dac, 0x4eb39e51, - 0x2eb8d6f6, 0x6eafd5cd, 0x6efed7bc, 0x2e20d7fe, - 0x6e21d41f, 0x6e63d441, 0x2e3cdf7a, 0x6e3edfbc, - 0x6e66dca4, 0x0e6097fe, 0x4e6694a4, 0x0ea894e6, - 0x4ea097fe, 0x0e3ccf7a, 0x4e34ce72, 0x4e6bcd49, - 0x2e6a9528, 0x6e6e95ac, 0x2ea29420, 0x6eb696b4, - 0x0ea3cc41, 0x4ebacf38, 0x4ee4cc62, 0x2e22fc20, - 0x6e2bfd49, 0x6e7aff38, 0x0e3c677a, 0x4e326630, - 0x0e6067fe, 0x4e656483, 0x0eac656a, 0x4eb96717, - 0x2e2c656a, 0x6e2664a4, 0x2e746672, 0x6e646462, - 0x2ead658b, 0x6eaa6528, 0x0e2ca56a, 0x4e31a60f, - 0x0e73a651, 0x4e64a462, 0x0eaca56a, 0x4eaea5ac, - 0x0e2ef5ac, 0x4e31f60f, 0x4e6ff5cd, 0x0e246c62, - 
0x4e296d07, 0x0e766eb4, 0x4e7c6f7a, 0x0eb26e30, - 0x4ea66ca4, 0x2e246c62, 0x6e266ca4, 0x2e6e6dac, - 0x6e746e72, 0x2eb76ed5, 0x6eb26e30, 0x0e34ae72, - 0x4e2dad8b, 0x0e77aed5, 0x4e79af17, 0x0eaeadac, - 0x4ebcaf7a, 0x0e79b717, 0x4e7eb7bc, 0x0eb0b5ee, - 0x4eadb58b, 0x0e3a2738, 0x4e232441, 0x0e6e25ac, - 0x4e61241f, 0x0eac256a, 0x4eb22630, 0x0ea9f507, - 0x4ea4f462, 0x4ee5f483, 0x2eafedcd, 0x6eb5ee93, - 0x6ef3ee51, 0x0fa31041, 0x4f8780c5, 0x4fc41862, - 0x0f895107, 0x4fa1880f, 0x4fcc516a, 0x2f8c916a, - 0x4f9089ee, 0x6fcf99cd, 0x0f748062, 0x4f4d818b, - 0x0fa1800f, 0x4f8880e6, 0x0e2b3549, 0x4e3e37bc, - 0x0e71360f, 0x4e7f37dd, 0x0eb836f6, 0x4ea1341f, - 0x4ef53693, 0x0e213c1f, 0x4e273cc5, 0x0e703dee, - 0x4e743e72, 0x0ea13c1f, 0x4eb43e72, 0x4efd3f9b, - 0x2e368eb4, 0x6e328e30, 0x2e6e8dac, 0x6e6d8d8b, - 0x2eab8d49, 0x6ea88ce6, 0x6ee08ffe, 0x2e333651, - 0x6e3d379b, 0x2e7e37bc, 0x6e6037fe, 0x2ea93507, - 0x6eac356a, 0x6ef636b4, 0x2e2c3d6a, 0x6e263ca4, - 0x2e7a3f38, 0x6e733e51, 0x2eb33e51, 0x6eb83ef6, - 0x6ee53c83, 0x0e3fe7dd, 0x4e31e60f, 0x4e78e6f6, - 0x2eb5e693, 0x6eb5e693, 0x6ef8e6f6, 0x2e24e462, - 0x6e31e60f, 0x6e68e4e6, 0x65922e06, 0x65d0303b, - 0x65903222, 0x659135ab, 0x65913b7e, 0x65d33821, - 0x254b9e29, 0x258f14f7, 0x25c2184c, 0x258222e5, - 0x25d23730, 0x250e9d99, 0x24e9460e, 0x2427465e, - 0x24a2a937, 0x24fbe6ae, 0xba5fd3e3, 0x3a5f03e5, + 0x9bae275e, 0x9ba7954d, 0x7ec315fe, 0x1ef0098c, + 0x1ef21bff, 0x1ef02ab3, 0x1ef5394f, 0x1efc4942, + 0x1eff5bc7, 0x1ee28832, 0x7ea3d546, 0x1e270979, + 0x1e201981, 0x1e3d2a63, 0x1e263ae6, 0x1e3b4b80, + 0x1e2758a2, 0x1e39899d, 0x7ef8d58d, 0x1e720913, + 0x1e751b56, 0x1e622a74, 0x1e683ade, 0x1e754a76, + 0x1e755a4c, 0x1e638a06, 0x1fc373a3, 0x1f0a35cf, + 0x1f0aea4c, 0x1f2f74e7, 0x1f2032e0, 0x1f4d21d8, + 0x1f49d0ef, 0x1f7f43b3, 0x1f705522, 0x1e20409e, + 0x1e20c361, 0x1e214319, 0x1e21c2ae, 0x1e22c0cd, + 0x1e23c32c, 0x1ee243d9, 0x1e6042bc, 0x1e60c2f0, + 0x1e6143a5, 0x1e61c276, 0x1e62428d, 0x1ee1c393, + 0x1e3800d1, 0x9e3800ed, 0x1e78035c, 0x9e7800d1, + 
0x1e220081, 0x9e22028e, 0x1e6202a7, 0x9e6202fb, + 0x1e24028d, 0x9e64039e, 0x1e3002aa, 0x9e700225, + 0x1e2601cb, 0x9e6602ad, 0x1e2701db, 0x9e6702e4, + 0x1e3e2300, 0x1e6e2180, 0x1e202228, 0x1e602388, + 0x29021b40, 0x297c78c0, 0x69660970, 0xa908018f, + 0xa9427ae7, 0x29a03cfa, 0x29fc3d4b, 0x69c84033, + 0xa988240e, 0xa9fa0d9b, 0x28a02d88, 0x28c8408a, + 0x68f87a6a, 0xa8ba09f8, 0xa8c52a18, 0x280257be, + 0x28727948, 0xa83868de, 0xa8440a98, 0x0c40733f, + 0x4cdfa1e5, 0x0ccd6cea, 0x4cdf260d, 0x0d40c227, + 0x4ddfcb30, 0x0dc7cc6b, 0x4c408ced, 0x0cdf8769, + 0x4d60c346, 0x0dffca17, 0x4de8cda6, 0x4cda4834, + 0x0c4049ef, 0x4d40e6dd, 0x4ddfe946, 0x0dcfeccf, + 0x4cdf0546, 0x0cc7006b, 0x0d60e32c, 0x0dffe5eb, + 0x0dfce8de, 0x0e31bb9b, 0x4e31bbbc, 0x0e71b841, + 0x4e71bbbc, 0x4eb1b841, 0x0e30aab4, 0x4e30abdd, + 0x0e70aa30, 0x4e70a9cd, 0x4eb0a96a, 0x6e30fbdd, + 0x0e31abdd, 0x2e31aa93, 0x4e31aaf6, 0x6e31a96a, + 0x0e71a8a4, 0x2e71a81f, 0x4e71aad5, 0x6e71a928, + 0x4eb1a81f, 0x6eb1aa93, 0x6eb0f96a, 0x7e30fbbc, + 0x7e70f862, 0x7eb0fb59, 0x7ef0f8c5, 0x0ea0c883, + 0x4ea0c928, 0x4ee0caf6, 0x2ea0ca93, 0x6ea0c9cd, + 0x6ee0c8c5, 0x0ea0dbdd, 0x4ea0db38, 0x4ee0dad5, + 0x0ea0eb7a, 0x4ea0eb38, 0x4ee0e883, 0x2ea0db38, + 0x6ea0db7a, 0x6ee0db17, 0x0e20ba0f, 0x4e20bad5, + 0x0e60b883, 0x4e60bb38, 0x0ea0b928, 0x4ea0bb59, + 0x4ee0bab4, 0x0ea0fa30, 0x4ea0fa51, 0x4ee0f862, + 0x2ea0f841, 0x6ea0f820, 0x6ee0fb38, 0x2ea1f8a4, + 0x6ea1f883, 0x6ee1f9ac, 0x2e20581f, 0x6e205bbc, + 0x0e2c1d6a, 0x4e3c1f7a, 0x0ea41c62, 0x4eae1dac, + 0x2e341e72, 0x6e211c1f, 0x0e238441, 0x4e2f85cd, + 0x0e7f87dd, 0x4e628420, 0x0eb58693, 0x4eae85ac, + 0x4ef38651, 0x0e380ef6, 0x4e2f0dcd, 0x0e7e0fbc, + 0x4e600ffe, 0x0ea10c1f, 0x4ea30c41, 0x4efc0f7a, + 0x2e3e0fbc, 0x6e260ca4, 0x2e600ffe, 0x6e660ca4, + 0x2ea80ce6, 0x6ea00ffe, 0x6efc0f7a, 0x0e34d672, + 0x4e2bd549, 0x4e6ad528, 0x2e2e85ac, 0x6e228420, + 0x2e7686b4, 0x6e638441, 0x2eba8738, 0x6ea48462, + 0x6ee28420, 0x0e2b2d49, 0x4e3a2f38, 0x0e7c2f7a, + 0x4e722e30, 0x0ea02ffe, 0x4ea52c83, 0x4eec2d6a, + 
0x2e392f17, 0x6e2c2d6a, 0x2e662ca4, 0x6e742e72, + 0x2ea42c62, 0x6ead2d8b, 0x6eea2d28, 0x0eacd56a, + 0x4eb1d60f, 0x4ef3d651, 0x0e249c62, 0x4e2c9d6a, + 0x0e6e9dac, 0x4e6e9dac, 0x0eb19e0f, 0x4eaf9dcd, + 0x2ea4d462, 0x6ea9d507, 0x6ef6d6b4, 0x2e3cd77a, + 0x6e32d630, 0x6e66d4a4, 0x2e24dc62, 0x6e26dca4, + 0x6e6eddac, 0x0e749672, 0x4e7796d5, 0x0eb29630, + 0x4eb49672, 0x0e2dcd8b, 0x4e37ced5, 0x4e79cf17, + 0x2e6e95ac, 0x6e7c977a, 0x2eb99717, 0x6ebe97bc, + 0x0eb0cdee, 0x4eadcd8b, 0x4efacf38, 0x2e23fc41, + 0x6e2efdac, 0x6e61fc1f, 0x0e2c656a, 0x4e326630, + 0x0e696507, 0x4e646462, 0x0ea56483, 0x4eaf65cd, + 0x2e356693, 0x6e336651, 0x2e726630, 0x6e656483, + 0x2ea36441, 0x6ead658b, 0x0e20a7fe, 0x4e27a4c5, + 0x0e6aa528, 0x4e71a60f, 0x0ebfa7dd, 0x4ea0a7fe, + 0x0e22f420, 0x4e36f6b4, 0x4e69f507, 0x0e366eb4, + 0x4e396f17, 0x0e7e6fbc, 0x4e776ed5, 0x0ebd6f9b, + 0x4ebb6f59, 0x2e276cc5, 0x6e236c41, 0x2e796f17, + 0x6e726e30, 0x2ea16c1f, 0x6ea76cc5, 0x0e2eadac, + 0x4e2bad49, 0x0e7eafbc, 0x4e71ae0f, 0x0ebfafdd, + 0x4eb8aef6, 0x0e61b41f, 0x4e75b693, 0x0ea1b41f, + 0x4ea7b4c5, 0x0e3025ee, 0x4e342672, 0x0e61241f, + 0x4e742672, 0x0ebd279b, 0x4eb626b4, 0x0eb2f630, + 0x4eaef5ac, 0x4eedf58b, 0x2eabed49, 0x6ea8ece6, + 0x6ee0effe, 0x0faf11cd, 0x4fa1880f, 0x4fc710c5, + 0x0fa750c5, 0x4f8e81ac, 0x4fca5928, 0x2fa39041, + 0x4fa98907, 0x6fcb9949, 0x0f6d818b, 0x4f498107, + 0x0f8880e6, 0x4f8788c5, 0x0e2f35cd, 0x4e393717, + 0x0e633441, 0x4e6037fe, 0x0eb53693, 0x4ea734c5, + 0x4ef33651, 0x0e243c62, 0x4e323e30, 0x0e783ef6, + 0x4e6f3dcd, 0x0eac3d6a, 0x4eb73ed5, 0x4eff3fdd, + 0x2e3d8f9b, 0x6e2e8dac, 0x2e7d8f9b, 0x6e658c83, + 0x2ea38c41, 0x6ea18c1f, 0x6efa8f38, 0x2e353693, + 0x6e333651, 0x2e6b3549, 0x6e7e37bc, 0x2ebd379b, + 0x6eb1360f, 0x6ee93507, 0x2e373ed5, 0x6e393f17, + 0x2e613c1f, 0x6e7b3f59, 0x2ea43c62, 0x6ea13c1f, + 0x6efd3f9b, 0x0e34e672, 0x4e2ce56a, 0x4e79e717, + 0x2eb5e693, 0x6ea5e483, 0x6ef4e672, 0x2e22e420, + 0x6e3be759, 0x6e7ce77a, 0x65d22c4b, 0x65d03f92, + 0x65902b68, 0x6591264e, 0x659135f3, 0x65d33444, + 
0x25c180af, 0x25cc0897, 0x25c71b6b, 0x25103080, + 0x251729f2, 0x25c48552, 0x24288aab, 0x242213f8, + 0x24fb63d6, 0x247cafab, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, 0x7a42cbe2, 0x93df03ff, 0xc820ffff, 0x8822fc7f, 0xc8247cbf, 0x88267fff, 0x4e010fe0, 0x5e040420, 0x4e081fe1, 0x4e0c1fe1, 0x4e0a1fe1, @@ -1625,55 +1645,55 @@ 0x1e721000, 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, - 0x1e7e1000, 0x1e7e3000, 0xf82081f1, 0xf824011a, - 0xf83c1376, 0xf83b22f9, 0xf82030c4, 0xf8305080, - 0xf82f4141, 0xf8277145, 0xf83c6287, 0xf8b780d5, - 0xf8ab0228, 0xf8bf1226, 0xf8a223cc, 0xf8bd3363, - 0xf8b651dd, 0xf8ad423c, 0xf8b87045, 0xf8ae620a, - 0xf8eb82fb, 0xf8ec02c4, 0xf8f11024, 0xf8f321f0, - 0xf8ed318e, 0xf8e25071, 0xf8f540b7, 0xf8e67267, - 0xf8ed623c, 0xf8708046, 0xf87d0083, 0xf8661290, - 0xf86d228c, 0xf8683299, 0xf8735160, 0xf8784286, - 0xf87f720e, 0xf86660e0, 0xb82f8353, 0xb82902ea, - 0xb8351396, 0xb82221e3, 0xb83330f4, 0xb82450fd, - 0xb8204209, 0xb8347097, 0xb83062ea, 0xb8ab80d9, - 0xb8bf01b0, 0xb8b7102c, 0xb8ae22a9, 0xb8b031fa, - 0xb8a451e4, 0xb8a843c6, 0xb8a4723d, 0xb8bd613a, - 0xb8ef8162, 0xb8fd00e3, 0xb8e112bb, 0xb8f0210e, - 0xb8f03336, 0xb8e552b4, 0xb8f04217, 0xb8fe7294, - 0xb8e06264, 0xb8788284, 0xb8640358, 0xb8731102, - 0xb868230e, 0xb87032df, 0xb864503f, 0xb86a4194, - 0xb86070e9, 0xb8786090, 0xce2a6cdb, 0xce107db8, - 0xce748ed6, 0xce8973bf, 0xce7480f4, 0xce6b853c, - 0xcec0818e, 0xce788834, 0x25a0cd89, 0x25a1d093, - 0x05803685, 0x05400c08, 0x050074c4, 0x2560d6a0, - 0x2521c0fb, 0x05805089, 0x05403e98, 0x05025238, - 0x25e0cd0b, 0x25e1d1d2, 0x05800e4e, 0x05402676, - 0x05001e63, 0x25a0d1c9, 0x2521c495, 0x0583abe2, - 0x054011ab, 0x05007cbe, 0x2560c3b7, 0x25e1c358, - 0x05806593, 0x054064b5, 0x05000e5a, 0x2520c3f1, - 0x25a1cc29, 0x05801468, 0x05401d71, 0x05035bb2, - 0x04bb01f0, 0x046806dc, 0x659c0385, 0x65d909e0, - 0x65c30415, 0x04fa10ba, 0x04611a33, 0x042e17ce, - 0x04bf1c52, 0x0456b7d7, 0x04400008, 
0x049a1417, - 0x04509b1a, 0x041b1456, 0x0499b58b, 0x04dab938, - 0x04991691, 0x04d395a4, 0x04d19ff6, 0x045011f2, - 0x0417be8d, 0x041eadc1, 0x04980987, 0x052799e4, - 0x05a49c23, 0x04c817e5, 0x044a0d2d, 0x04c901fe, - 0x044b0343, 0x04c10839, 0x04dcac2a, 0x65c087ba, - 0x658d8791, 0x65869d61, 0x65c78021, 0x65828c5b, - 0x049db33e, 0x65c2b862, 0x65c0ac7d, 0x65c1b38e, - 0x65cdab64, 0x65c19022, 0x65fc97e7, 0x65bd162a, - 0x65b82596, 0x65a0a969, 0x65a4d697, 0x65feec8f, - 0x65ba46bb, 0x65a4633f, 0x04c742a6, 0x049f7f18, - 0x042c3141, 0x04b9310d, 0x047733e1, 0x04f53014, - 0x05bb6bbf, 0x05ba6fa8, 0x65c88645, 0x4555b34d, - 0x45cab660, 0x043138c7, 0x44589b94, 0x445a8e71, - 0x44198b1a, 0x449b8f8b, 0x049a3797, 0x04183f14, - 0x045926fb, 0x04c825ac, 0x040a369a, 0x65873fa2, - 0x6586347d, 0x65982b85, 0x04412dd1, 0x0e2c116a, - 0x4e2a1128, 0x0e6b1149, 0x4e751293, 0x0ea21020, - 0x4ebf13dd, 0x2e321230, 0x6e321230, 0x2e6f11cd, - 0x6e791317, 0x2eba1338, 0x6eb91317, + 0x1e7e1000, 0x1e7e3000, 0xf83180b8, 0xf822014e, + 0xf830136b, 0xf837208c, 0xf8363091, 0xf8215213, + 0xf83041cd, 0xf82c7222, 0xf82362f5, 0xf8a580e6, + 0xf8b3038d, 0xf8b110d0, 0xf8a2207d, 0xf8a431e6, + 0xf8b4518d, 0xf8b44328, 0xf8b47013, 0xf8ab60d8, + 0xf8f481df, 0xf8f00006, 0xf8e7126f, 0xf8fa2149, + 0xf8f732d5, 0xf8fc5062, 0xf8ef4293, 0xf8e773a4, + 0xf8e76120, 0xf87082f4, 0xf8640150, 0xf877132b, + 0xf866221f, 0xf86d3197, 0xf861512e, 0xf8754350, + 0xf86f7084, 0xf87060c8, 0xb83e83a4, 0xb831035d, + 0xb829104f, 0xb82b207d, 0xb8273361, 0xb83551d0, + 0xb82842d0, 0xb8397285, 0xb83562f0, 0xb8b0829e, + 0xb8b40080, 0xb8b31098, 0xb8b42304, 0xb8ba3053, + 0xb8a851c8, 0xb8b843f0, 0xb8b673e4, 0xb8a1628a, + 0xb8ec8120, 0xb8e701f8, 0xb8e410db, 0xb8ea231b, + 0xb8ed33f0, 0xb8f65296, 0xb8ff413d, 0xb8ee70f4, + 0xb8f4613c, 0xb86b818e, 0xb8740301, 0xb86911b3, + 0xb8732210, 0xb8653060, 0xb86c51e8, 0xb86f4090, + 0xb86f70be, 0xb86062ca, 0xce20247b, 0xce0a63b3, + 0xce678e84, 0xce8eafb8, 0xce6d836b, 0xce7187f2, + 0xcec0806e, 0xce768a1e, 0x2520d474, 0x2521dae3, 
+ 0x05800d33, 0x05403635, 0x05004cb8, 0x2560d175, + 0x2561c35e, 0x05809863, 0x054030f8, 0x05000ed7, + 0x2520c84e, 0x2521d69a, 0x05809892, 0x05408909, + 0x05000d2c, 0x2560d9cb, 0x25e1d352, 0x05806b49, + 0x0542d157, 0x050026a8, 0x2520cf2a, 0x25a1d599, + 0x05801ec0, 0x05422dc5, 0x05000e11, 0x2560c07e, + 0x2521dfb2, 0x0580ab15, 0x0540040c, 0x0500000f, + 0x04fb0353, 0x043606cd, 0x65940161, 0x65980b14, + 0x65d4063f, 0x04751095, 0x04ff1ade, 0x0473165a, + 0x04bd1dab, 0x0456a1c5, 0x04400542, 0x045a0753, + 0x041083c2, 0x04db0694, 0x0459adbd, 0x045abc2e, + 0x04d9007c, 0x04139929, 0x041189da, 0x04d018f4, + 0x04d7b0d4, 0x045ea3ad, 0x04180029, 0x052799fb, + 0x05e49e24, 0x04080302, 0x040a1dba, 0x04c90e16, + 0x04cb0571, 0x04010210, 0x04dca6fc, 0x65c0915c, + 0x65cd9cf1, 0x65868f04, 0x65878969, 0x65c296c4, + 0x049da1e4, 0x65c2bf44, 0x6580b745, 0x6581a33f, + 0x65cda468, 0x65c19b07, 0x65a19e38, 0x65e81dac, + 0x65f723fd, 0x65e7a2b4, 0x65e8dadd, 0x65e6f4ba, + 0x65b54f52, 0x65bc7140, 0x04d447d1, 0x048e6e3c, + 0x042b334a, 0x04af3160, 0x047432f7, 0x04fd3297, + 0x05a66b60, 0x05646d8d, 0x65c89aff, 0x45ddb046, + 0x4517b7a0, 0x04253904, 0x445891ad, 0x445a8908, + 0x449983b3, 0x44db8ef0, 0x041a3db7, 0x04583419, + 0x04593d79, 0x044836ce, 0x044a3005, 0x65c72069, + 0x658627ae, 0x65d8348e, 0x04812edb, 0x0e21101f, + 0x4e3a1338, 0x0e6d118b, 0x4e721230, 0x0eae11ac, + 0x4eb31251, 0x2e3e13bc, 0x6e251083, 0x2e7e13bc, + 0x6e721230, 0x2ea610a4, 0x6ebf13dd, }; // END Generated code -- do not edit diff --git a/test/hotspot/jtreg/compiler/c2/irTests/ConvF2HFIdealizationTests.java b/test/hotspot/jtreg/compiler/c2/irTests/ConvF2HFIdealizationTests.java index d7b927778df2d..6796dc68e5255 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/ConvF2HFIdealizationTests.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/ConvF2HFIdealizationTests.java @@ -55,6 +55,9 @@ public static void main(String[] args) { @IR(counts = {IRNode.REINTERPRET_S2HF, ">=1", IRNode.REINTERPRET_HF2S, ">=1", IRNode.ADD_HF, ">=1" }, failOn = 
{IRNode.ADD_F, IRNode.CONV_HF2F, IRNode.CONV_F2HF}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.REINTERPRET_S2HF, ">=1", IRNode.REINTERPRET_HF2S, ">=1", IRNode.ADD_HF, ">=1" }, + failOn = {IRNode.ADD_F, IRNode.CONV_HF2F, IRNode.CONV_F2HF}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) // Test pattern - ConvHF2F -> AddF -> ConvF2HF is optimized to ReinterpretS2HF -> AddHF -> ReinterpretHF2S public void test1() { for (int i = 0; i < SIZE; i++) { diff --git a/test/hotspot/jtreg/compiler/c2/irTests/MulHFNodeIdealizationTests.java b/test/hotspot/jtreg/compiler/c2/irTests/MulHFNodeIdealizationTests.java index eab220dc196cd..82f43a9a09778 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/MulHFNodeIdealizationTests.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/MulHFNodeIdealizationTests.java @@ -56,6 +56,10 @@ public MulHFNodeIdealizationTests() { @IR(counts = {IRNode.ADD_HF, "1"}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}, failOn = {IRNode.MUL_HF}) + @IR(counts = {IRNode.ADD_HF, "1"}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}, + failOn = {IRNode.MUL_HF}) + // Test if x * 2 is optimized to x + x public void test1() { dst = multiply(src, valueOf(2.0f)); } diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java b/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java index 014c660dfb2b6..0acefad5d1dd4 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java @@ -43,6 +43,8 @@ public class TestFloat16ScalarOperations { private short[] dst; private short res; + private float[] fl; + private static final Float16 ONE = valueOf(1.0f); private static final Float16 MONE = valueOf(-1.0f); private static final Float16 POSITIVE_ZERO = valueOf(0.0f); @@ -76,8 +78,10 @@ public static void main(String args[]) { public 
TestFloat16ScalarOperations() { src = new short[count]; dst = new short[count]; + fl = new float[count]; for (int i = 0; i < count; i++) { src[i] = Float.floatToFloat16(r.nextFloat() * MAX_VALUE.floatValue()); + fl[i] = r.nextFloat(); } } @@ -99,14 +103,31 @@ static void assertResult(float actual, float expected, String msg, int iter) { } } + @Test + @IR(counts = {"convF2HFAndS2HF", " >0 "}, phase = {CompilePhase.FINAL_CODE}, + applyIfCPUFeature = {"avx512_fp16", "true"}) + @IR(counts = {"convF2HFAndS2HF", " >0 "}, phase = {CompilePhase.FINAL_CODE}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) + public void testconvF2HFAndS2HF() { + for (int i = 0; i < count; i++) { + // Transform the pattern (S2HF ConvF2HF) in this IR - + // HF2S (AddHF (S2HF (ConvF2HF fl[i])), (S2HF (ConvF2HF fl[i]))) + // to a single convert operation after matching and eliminate redundant moves + dst[i] = float16ToRawShortBits(add(valueOf(fl[i]), valueOf(fl[i]))); + } + } + @Test @IR(counts = {"convHF2SAndHF2F", " >0 "}, phase = {CompilePhase.FINAL_CODE}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {"convHF2SAndHF2F", " >0 "}, phase = {CompilePhase.FINAL_CODE}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testEliminateIntermediateHF2S() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { // Intermediate HF2S + S2HF is eliminated in following transformation // AddHF S2HF(HF2S (AddHF S2HF(src[i]), S2HF(0))), S2HF(src[i]) => AddHF (AddHF S2HF(src[i]), S2HF(0)), S2HF(src[i]) + // Also, the backend optimizes away the extra move while converting res to a float - ConvHF2F (S2HF (AddHF ..)) res = add(add(res, shortBitsToFloat16(src[i])), shortBitsToFloat16(src[i])); dst[i] = (short)res.floatValue(); } @@ -115,6 +136,8 @@ public void testEliminateIntermediateHF2S() { @Test @IR(counts = {IRNode.ADD_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = 
{"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.ADD_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testAdd1() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -126,6 +149,8 @@ public void testAdd1() { @Test @IR(failOn = {IRNode.ADD_HF, IRNode.REINTERPRET_S2HF, IRNode.REINTERPRET_HF2S}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(failOn = {IRNode.ADD_HF, IRNode.REINTERPRET_S2HF, IRNode.REINTERPRET_HF2S}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testAdd2() { Float16 hf0 = shortBitsToFloat16((short)0); Float16 hf1 = shortBitsToFloat16((short)15360); @@ -138,6 +163,8 @@ public void testAdd2() { @Test @IR(counts = {IRNode.SUB_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.SUB_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testSub() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -149,6 +176,8 @@ public void testSub() { @Test @IR(counts = {IRNode.MUL_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MUL_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMul() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -160,6 +189,8 @@ public void testMul() { @Test @IR(counts = {IRNode.DIV_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.DIV_HF, " >0 ", 
IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testDiv() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -171,6 +202,8 @@ public void testDiv() { @Test @IR(counts = {IRNode.DIV_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.DIV_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testDivByOne() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -182,6 +215,8 @@ public void testDivByOne() { @Test @IR(counts = {IRNode.MAX_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MAX_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMax() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -193,6 +228,8 @@ public void testMax() { @Test @IR(counts = {IRNode.MIN_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MIN_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMin() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -204,6 +241,8 @@ public void testMin() { @Test @IR(counts = {IRNode.SQRT_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.SQRT_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", 
IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testSqrt() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -215,6 +254,8 @@ public void testSqrt() { @Test @IR(counts = {IRNode.FMA_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.FMA_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testFma() { Float16 res = shortBitsToFloat16((short)0); for (int i = 0; i < count; i++) { @@ -227,6 +268,8 @@ public void testFma() { @Test @IR(counts = {IRNode.MUL_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MUL_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testDivByPOT() { Float16 res = valueOf(0.0f); for (int i = 0; i < 50; i++) { @@ -244,6 +287,8 @@ public void testDivByPOT() { @Test @IR(counts = {IRNode.MUL_HF, " 0 ", IRNode.ADD_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MUL_HF, " 0 ", IRNode.ADD_HF, " >0 ", IRNode.REINTERPRET_S2HF, " >0 ", IRNode.REINTERPRET_HF2S, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMulByTWO() { Float16 res = valueOf(0.0f); Float16 multiplier = valueOf(2.0f); @@ -281,6 +326,8 @@ public void testMulByTWO() { @Test @IR(counts = {IRNode.ADD_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.ADD_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, 
" 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testAddConstantFolding() { // If either value is NaN, then the result is NaN. assertResult(add(Float16.NaN, valueOf(2.0f)).floatValue(), Float.NaN, "testAddConstantFolding"); @@ -324,6 +371,8 @@ public void testAddConstantFolding() { @Test @IR(counts = {IRNode.SUB_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.SUB_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testSubConstantFolding() { // If either value is NaN, then the result is NaN. assertResult(subtract(Float16.NaN, valueOf(2.0f)).floatValue(), Float.NaN, "testAddConstantFolding"); @@ -357,6 +406,8 @@ public void testSubConstantFolding() { @Warmup(value = 10000) @IR(counts = {IRNode.MAX_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MAX_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMaxConstantFolding() { // If either value is NaN, then the result is NaN. assertResult(max(valueOf(2.0f), Float16.NaN).floatValue(), Float.NaN, "testMaxConstantFolding"); @@ -375,6 +426,8 @@ public void testMaxConstantFolding() { @Test @IR(counts = {IRNode.MIN_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MIN_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMinConstantFolding() { // If either value is NaN, then the result is NaN. 
assertResult(min(valueOf(2.0f), Float16.NaN).floatValue(), Float.NaN, "testMinConstantFolding"); @@ -392,6 +445,8 @@ public void testMinConstantFolding() { @Test @IR(counts = {IRNode.DIV_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.DIV_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testDivConstantFolding() { // If either value is NaN, then the result is NaN. assertResult(divide(Float16.NaN, POSITIVE_ZERO).floatValue(), Float.NaN, "testDivConstantFolding"); @@ -432,6 +487,8 @@ public void testDivConstantFolding() { @Test @IR(counts = {IRNode.MUL_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.MUL_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testMulConstantFolding() { // If any operand is NaN, the result is NaN. assertResult(multiply(Float16.NaN, valueOf(4.0f)).floatValue(), Float.NaN, "testMulConstantFolding"); @@ -455,6 +512,8 @@ public void testMulConstantFolding() { @Test @IR(counts = {IRNode.SQRT_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.SQRT_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testSqrtConstantFolding() { // If the argument is NaN or less than zero, then the result is NaN. 
assertResult(sqrt(Float16.NaN).floatValue(), Float.NaN, "testSqrtConstantFolding"); @@ -474,6 +533,8 @@ public void testSqrtConstantFolding() { @Test @IR(counts = {IRNode.FMA_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.FMA_HF, " 0 ", IRNode.REINTERPRET_S2HF, " 0 ", IRNode.REINTERPRET_HF2S, " 0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testFMAConstantFolding() { // If any argument is NaN, the result is NaN. assertResult(fma(Float16.NaN, valueOf(2.0f), valueOf(3.0f)).floatValue(), Float.NaN, "testFMAConstantFolding"); @@ -509,6 +570,8 @@ public void testFMAConstantFolding() { @Test @IR(failOn = {IRNode.ADD_HF, IRNode.SUB_HF, IRNode.MUL_HF, IRNode.DIV_HF, IRNode.SQRT_HF, IRNode.FMA_HF}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(failOn = {IRNode.ADD_HF, IRNode.SUB_HF, IRNode.MUL_HF, IRNode.DIV_HF, IRNode.SQRT_HF, IRNode.FMA_HF}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testRounding1() { dst[0] = float16ToRawShortBits(add(RANDOM1, RANDOM2)); dst[1] = float16ToRawShortBits(subtract(RANDOM2, RANDOM3)); @@ -548,6 +611,9 @@ public void checkRounding1() { @IR(counts = {IRNode.ADD_HF, " >0 ", IRNode.SUB_HF, " >0 ", IRNode.MUL_HF, " >0 ", IRNode.DIV_HF, " >0 ", IRNode.SQRT_HF, " >0 ", IRNode.FMA_HF, " >0 "}, applyIfCPUFeatureOr = {"avx512_fp16", "true", "zfh", "true"}) + @IR(counts = {IRNode.ADD_HF, " >0 ", IRNode.SUB_HF, " >0 ", IRNode.MUL_HF, " >0 ", + IRNode.DIV_HF, " >0 ", IRNode.SQRT_HF, " >0 ", IRNode.FMA_HF, " >0 "}, + applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) public void testRounding2() { dst[0] = float16ToRawShortBits(add(RANDOM1_VAR, RANDOM2_VAR)); dst[1] = float16ToRawShortBits(subtract(RANDOM2_VAR, RANDOM3_VAR)); diff --git a/test/hotspot/jtreg/compiler/floatingpoint/TestSubNodeFloatDoubleNegation.java 
b/test/hotspot/jtreg/compiler/floatingpoint/TestSubNodeFloatDoubleNegation.java index 87aeb4489ba4e..4c7092ec65439 100644 --- a/test/hotspot/jtreg/compiler/floatingpoint/TestSubNodeFloatDoubleNegation.java +++ b/test/hotspot/jtreg/compiler/floatingpoint/TestSubNodeFloatDoubleNegation.java @@ -55,9 +55,8 @@ public static void assertResults() { @Test @IR(counts = { IRNode.SUB, "2" }, applyIfPlatform = {"x64", "true"}, applyIfCPUFeature = {"avx512_fp16", "false"}) @IR(counts = { IRNode.SUB_HF, "2" }, applyIfPlatform = {"x64", "true"}, applyIfCPUFeature = {"avx512_fp16", "true"}) - // TODO: uncomment once Float16 support lands in aarch64 with JDK-8345125 - //@IR(counts = { IRNode.SUB, "2" }, applyIfPlatform = {"aarch64", "true"}, applyIfCPUFeatureAnd = {"fphp", "false", "asimdhp", "false"}) - //@IR(counts = { IRNode.SUB_HF, "2" }, applyIfPlatform = {"aarch64", "true"}, applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) + @IR(counts = { IRNode.SUB, "2" }, applyIfPlatform = {"aarch64", "true"}, applyIfCPUFeatureAnd = {"fphp", "false", "asimdhp", "false"}) + @IR(counts = { IRNode.SUB_HF, "2" }, applyIfPlatform = {"aarch64", "true"}, applyIfCPUFeatureAnd = {"fphp", "true", "asimdhp", "true"}) @IR(counts = { IRNode.SUB, "2" }, applyIfPlatform = {"riscv64", "true"}, applyIfCPUFeature = {"zfh", "false"}) @IR(counts = { IRNode.SUB_HF, "2" }, applyIfPlatform = {"riscv64", "true"}, applyIfCPUFeature = {"zfh", "true"}) @IR(counts = { IRNode.SUB, "2" }, applyIfPlatformAnd = {"x64", "false", "aarch64", "false", "riscv64", "false"}) diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java index 7d229cae15201..4ad95ab786f57 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java @@ -109,6 +109,8 @@ public class IREncodingPrinter { "asimd", "sve", "sve2", + "fphp", + 
"asimdhp", // RISCV64 "rvv", "zbkb", diff --git a/test/hotspot/jtreg/compiler/vectorization/TestFloat16VectorConvChain.java b/test/hotspot/jtreg/compiler/vectorization/TestFloat16VectorConvChain.java index 2cb04c3889b60..f061161e93aeb 100644 --- a/test/hotspot/jtreg/compiler/vectorization/TestFloat16VectorConvChain.java +++ b/test/hotspot/jtreg/compiler/vectorization/TestFloat16VectorConvChain.java @@ -45,6 +45,8 @@ public class TestFloat16VectorConvChain { counts = {IRNode.VECTOR_CAST_HF2F, IRNode.VECTOR_SIZE_ANY, ">= 1", IRNode.VECTOR_CAST_F2HF, IRNode.VECTOR_SIZE_ANY, " >= 1"}) @IR(applyIfCPUFeatureAnd = {"avx512_fp16", "false", "f16c", "true"}, counts = {IRNode.VECTOR_CAST_HF2F, IRNode.VECTOR_SIZE_ANY, ">= 1", IRNode.VECTOR_CAST_F2HF, IRNode.VECTOR_SIZE_ANY, " >= 1"}) + @IR(applyIfCPUFeatureAnd = {"asimd", "true", "fphp", "false", "asimdhp", "false"}, + counts = {IRNode.VECTOR_CAST_HF2F, IRNode.VECTOR_SIZE_ANY, ">= 1", IRNode.VECTOR_CAST_F2HF, IRNode.VECTOR_SIZE_ANY, " >= 1"}) public static void test(short [] res, short [] src1, short [] src2) { for (int i = 0; i < res.length; i++) { res[i] = (short)Float.float16ToFloat(Float.floatToFloat16(Float.float16ToFloat(src1[i]) + Float.float16ToFloat(src2[i])));