diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 37aea5f6657f01..45da8af51bb9ce 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -119,6 +119,10 @@ clang/test/AST/Interp/ @tbaederr /mlir/test/python/ @ftynse @makslevental @stellaraccident /mlir/python/ @ftynse @makslevental @stellaraccident +# MLIR Mem2Reg/SROA +/mlir/**/Transforms/Mem2Reg.* @moxinilian +/mlir/**/Transforms/SROA.* @moxinilian + # BOLT /bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci diff --git a/bolt/test/AArch64/constant_island_pie_update.s b/bolt/test/AArch64/constant_island_pie_update.s index 0ab67d07a854ec..313e103b19c05a 100644 --- a/bolt/test/AArch64/constant_island_pie_update.s +++ b/bolt/test/AArch64/constant_island_pie_update.s @@ -18,7 +18,7 @@ # RUN: llvm-objdump -j .text -d --show-all-symbols %t.relr.bolt | FileCheck %s # RUN: llvm-objdump -j .text -d %t.relr.bolt | \ # RUN: FileCheck %s --check-prefix=ADDENDCHECK -# RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=ELFCHECK %s +# RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=RELRELFCHECK %s # RUN: llvm-readelf -SW %t.relr.bolt | FileCheck --check-prefix=RELRSZCHECK %s // Check that the CI value was updated @@ -51,6 +51,12 @@ # ELFCHECK-NEXT: {{.*}} R_AARCH64_RELATIVE # ELFCHECK: {{.*}}[[#OFF]] {{.*}} $d +# RELRELFCHECK: $d{{$}} +# RELRELFCHECK-NEXT: $d + 0x8{{$}} +# RELRELFCHECK-NEXT: $d + 0x18{{$}} +# RELRELFCHECK-NEXT: mytextP +# RELRELFCHECK-EMPTY: + // Check that .relr.dyn size is 2 bytes to ensure that last 3 relocations were // encoded as a bitmap so the total section size for 3 relocations is 2 bytes. # RELRSZCHECK: .relr.dyn RELR [[#%x,ADDR:]] [[#%x,OFF:]] {{0*}}10 diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp index 84233c36e15d98..2afff2929cf79c 100644 --- a/clang-tools-extra/clang-doc/Representation.cpp +++ b/clang-tools-extra/clang-doc/Representation.cpp @@ -380,8 +380,8 @@ ClangDocContext::ClangDocContext(tooling::ExecutionContext *ECtx, this->SourceRoot = std::string(SourceRootDir); if (!RepositoryUrl.empty()) { this->RepositoryUrl = std::string(RepositoryUrl); - if (!RepositoryUrl.empty() && RepositoryUrl.find("http://") != 0 && - RepositoryUrl.find("https://") != 0) + if (!RepositoryUrl.empty() && !RepositoryUrl.starts_with("http://") && + !RepositoryUrl.starts_with("https://")) this->RepositoryUrl->insert(0, "https://"); } } diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp index 5260a8b4ecb0ba..32f5edddfe80b1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.cpp @@ -8,7 +8,9 @@ #include "LambdaFunctionNameCheck.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/DeclCXX.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/Preprocessor.h" @@ -56,6 +58,8 @@ class MacroExpansionsWithFileAndLine : public PPCallbacks { LambdaFunctionNameCheck::SourceRangeSet* SuppressMacroExpansions; }; +AST_MATCHER(CXXMethodDecl, isInLambda) { return Node.getParent()->isLambda(); } + } // namespace LambdaFunctionNameCheck::LambdaFunctionNameCheck(StringRef Name, @@ -69,9 +73,13 @@ void LambdaFunctionNameCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { } void LambdaFunctionNameCheck::registerMatchers(MatchFinder 
*Finder) { - // Match on PredefinedExprs inside a lambda. - Finder->addMatcher(predefinedExpr(hasAncestor(lambdaExpr())).bind("E"), - this); + Finder->addMatcher( + cxxMethodDecl(isInLambda(), + hasBody(forEachDescendant( + predefinedExpr(hasAncestor(cxxMethodDecl().bind("fn"))) + .bind("E"))), + equalsBoundNode("fn")), + this); } void LambdaFunctionNameCheck::registerPPCallbacks( diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index e432db8d0912e7..7f1ae5c43d80c6 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS support AllTargetsInfos FrontendOpenMP + TargetParser ) if(CLANG_BUILT_STANDALONE) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index a457e6fcae9462..9ef1d38d3c4560 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -155,6 +155,10 @@ Changes in existing checks ` check to ignore code within unevaluated contexts, such as ``decltype``. +- Improved :doc:`bugprone-lambda-function-name` + check by ignoring ``__func__`` macro in lambda captures, initializers of + default parameters and nested function declarations. + - Improved :doc:`bugprone-non-zero-enum-to-bool-conversion ` check by eliminating false positives resulting from direct usage of bitwise operators diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 8bc46acad56c84..3a06d7c30c9b79 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -341,9 +341,9 @@ Clang-Tidy Checks :doc:`portability-std-allocator-const `, :doc:`readability-avoid-const-params-in-decls `, "Yes" :doc:`readability-avoid-nested-conditional-operator `, - :doc:`readability-avoid-return-with-void-value `, + :doc:`readability-avoid-return-with-void-value `, "Yes" :doc:`readability-avoid-unconditional-preprocessor-if `, - :doc:`readability-braces-around-statements `, "Yes" + :doc:`readability-braces-around-statements `, :doc:`readability-const-return-type `, "Yes" :doc:`readability-container-contains `, "Yes" :doc:`readability-container-data-pointer `, "Yes" @@ -529,12 +529,12 @@ Clang-Tidy Checks :doc:`cppcoreguidelines-non-private-member-variables-in-classes `, :doc:`misc-non-private-member-variables-in-classes `, :doc:`cppcoreguidelines-use-default-member-init `, :doc:`modernize-use-default-member-init `, "Yes" :doc:`fuchsia-header-anon-namespaces `, :doc:`google-build-namespaces `, - :doc:`google-readability-braces-around-statements `, :doc:`readability-braces-around-statements `, "Yes" + :doc:`google-readability-braces-around-statements `, :doc:`readability-braces-around-statements `, :doc:`google-readability-function-size `, :doc:`readability-function-size `, :doc:`google-readability-namespace-comments `, :doc:`llvm-namespace-comment `, :doc:`hicpp-avoid-c-arrays `, :doc:`modernize-avoid-c-arrays `, :doc:`hicpp-avoid-goto `, :doc:`cppcoreguidelines-avoid-goto `, - :doc:`hicpp-braces-around-statements `, :doc:`readability-braces-around-statements `, "Yes" + :doc:`hicpp-braces-around-statements `, :doc:`readability-braces-around-statements `, :doc:`hicpp-deprecated-headers `, :doc:`modernize-deprecated-headers `, "Yes" :doc:`hicpp-explicit-conversions `, :doc:`google-explicit-constructor `, "Yes" :doc:`hicpp-function-size `, 
:doc:`readability-function-size `, diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp index 936ee87a856cd2..5c2bb5713239ce 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/lambda-function-name.cpp @@ -19,6 +19,22 @@ void Positives() { // CHECK-MESSAGES-NO-CONFIG: :[[@LINE-1]]:8: warning: inside a lambda, '__FUNCTION__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] [] { EMBED_IN_ANOTHER_MACRO1; }(); // CHECK-MESSAGES-NO-CONFIG: :[[@LINE-1]]:8: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + [] { + __func__; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + struct S { + void f() { + __func__; + [] { + __func__; + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + }(); + __func__; + } + }; + __func__; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: inside a lambda, '__func__' expands to the name of the function call operator; consider capturing the name of the enclosing function explicitly [bugprone-lambda-function-name] + }(); } #define FUNC_MACRO_WITH_FILE_AND_LINE Foo(__func__, __FILE__, __LINE__) @@ -40,4 +56,7 @@ void Negatives() { [] { FUNC_MACRO_WITH_FILE_AND_LINE; }(); [] { FUNCTION_MACRO_WITH_FILE_AND_LINE; }(); [] { EMBED_IN_ANOTHER_MACRO2; }(); + + [] (const char* func = __func__) { func; }(); + [func=__func__] { func; }(); } diff --git a/clang/cmake/caches/Apple-stage2.cmake b/clang/cmake/caches/Apple-stage2.cmake index ede256a2da6b8f..e919c56739679e 100644 --- a/clang/cmake/caches/Apple-stage2.cmake +++ b/clang/cmake/caches/Apple-stage2.cmake @@ -16,6 +16,7 @@ set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "") set(LLVM_ENABLE_MODULES ON CACHE BOOL "") set(LLVM_EXTERNALIZE_DEBUGINFO ON CACHE BOOL "") set(LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES OFF CACHE BOOL "") +set(LLVM_PLUGIN_SUPPORT OFF CACHE BOOL "") set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "") set(CLANG_SPAWN_CC1 ON CACHE BOOL "") set(BUG_REPORT_URL "http://developer.apple.com/bugreporter/" CACHE STRING "") diff --git a/clang/docs/ClangOffloadBundler.rst b/clang/docs/ClangOffloadBundler.rst index 1f8c85a08f8c79..515e6c00a3b800 100644 --- a/clang/docs/ClangOffloadBundler.rst +++ b/clang/docs/ClangOffloadBundler.rst @@ -518,11 +518,14 @@ The compressed offload bundle begins with a header followed by the compressed bi This is a unique identifier to distinguish compressed offload bundles. The value is the string 'CCOB' (Compressed Clang Offload Bundle). - **Version Number (16-bit unsigned int)**: - This denotes the version of the compressed offload bundle format. The current version is `1`. + This denotes the version of the compressed offload bundle format. The current version is `2`. - **Compression Method (16-bit unsigned int)**: This field indicates the compression method used. 
The value corresponds to either `zlib` or `zstd`, represented as a 16-bit unsigned integer cast from the LLVM compression enumeration. +- **Total File Size (32-bit unsigned int)**: + This is the total size (in bytes) of the file, including the header. Available in version 2 and above. + - **Uncompressed Binary Size (32-bit unsigned int)**: This is the size (in bytes) of the binary data before it was compressed. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 193bbd6b1a4702..c44f238e33846b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -549,6 +549,7 @@ Bug Fixes to C++ Support - Fix a crash in requires expression with templated base class member function. Fixes (#GH84020). - Fix a crash caused by defined struct in a type alias template when the structure has fields with dependent type. Fixes (#GH75221). +- Fix the Itanium mangling of lambdas defined in a member of a local class (#GH88906). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -558,6 +559,9 @@ Bug Fixes to AST Handling Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed an infinite recursion in the ASTImporter when importing a return type + declared inside the body of a C++11 lambda without a trailing return type + (#GH68775). + Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -695,6 +699,8 @@ Static Analyzer - Support C++23 static operator calls. (#GH84972) - Fixed a crash in ``security.cert.env.InvalidPtr`` checker when accidentally matched user-defined ``strerror`` and similar library functions. (GH#88181) +- Fixed a crash when storing through an address that refers to the address of + a label. (GH#89185) New features ^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 0a9c9e17d3f9f9..8b121896d66d15 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -157,7 +157,7 @@ class PragmaCommentDecl final SourceLocation CommentLoc, PragmaMSCommentKind CommentKind, StringRef Arg); - static PragmaCommentDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static PragmaCommentDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned ArgSize); PragmaMSCommentKind getCommentKind() const { return CommentKind; } @@ -192,7 +192,7 @@ class PragmaDetectMismatchDecl final SourceLocation Loc, StringRef Name, StringRef Value); static PragmaDetectMismatchDecl * - CreateDeserialized(ASTContext &C, unsigned ID, unsigned NameValueSize); + CreateDeserialized(ASTContext &C, DeclID ID, unsigned NameValueSize); StringRef getName() const { return getTrailingObjects<char>(); } StringRef getValue() const { return getTrailingObjects<char>() + ValueStart; } @@ -518,7 +518,7 @@ class LabelDecl : public NamedDecl { static LabelDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation IdentL, IdentifierInfo *II, SourceLocation GnuLabelL); - static LabelDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static LabelDecl *CreateDeserialized(ASTContext &C, DeclID ID); LabelStmt *getStmt() const { return TheStmt; } void setStmt(LabelStmt *T) { TheStmt = T; } @@ -581,7 +581,7 @@ class NamespaceDecl : public NamedDecl, public DeclContext, IdentifierInfo *Id, NamespaceDecl *PrevDecl, bool Nested); - static NamespaceDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static NamespaceDecl *CreateDeserialized(ASTContext &C, DeclID ID); using redecl_range = redeclarable_base::redecl_range; using redecl_iterator = redeclarable_base::redecl_iterator; @@ -1146,7 +1146,7 @@ class VarDecl : public DeclaratorDecl, public Redeclarable<VarDecl> { const
IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S); - static VarDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static VarDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -1728,7 +1728,7 @@ class ImplicitParamDecl : public VarDecl { static ImplicitParamDecl *Create(ASTContext &C, QualType T, ImplicitParamKind ParamKind); - static ImplicitParamDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ImplicitParamDecl *CreateDeserialized(ASTContext &C, DeclID ID); ImplicitParamDecl(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, const IdentifierInfo *Id, QualType Type, @@ -1782,7 +1782,7 @@ class ParmVarDecl : public VarDecl { TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg); - static ParmVarDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ParmVarDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -2178,7 +2178,7 @@ class FunctionDecl : public DeclaratorDecl, bool hasWrittenPrototype, ConstexprSpecKind ConstexprKind, Expr *TrailingRequiresClause); - static FunctionDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static FunctionDecl *CreateDeserialized(ASTContext &C, DeclID ID); DeclarationNameInfo getNameInfo() const { return DeclarationNameInfo(getDeclName(), getLocation(), DNLoc); @@ -3136,7 +3136,7 @@ class FieldDecl : public DeclaratorDecl, public Mergeable { TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle); - static FieldDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static FieldDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Returns the index of this field within its record, /// as appropriate for passing to ASTRecordLayout::getFieldOffset. 
@@ -3311,7 +3311,7 @@ class EnumConstantDecl : public ValueDecl, SourceLocation L, IdentifierInfo *Id, QualType T, Expr *E, const llvm::APSInt &V); - static EnumConstantDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static EnumConstantDecl *CreateDeserialized(ASTContext &C, DeclID ID); const Expr *getInitExpr() const { return (const Expr*) Init; } Expr *getInitExpr() { return (Expr*) Init; } @@ -3357,7 +3357,7 @@ class IndirectFieldDecl : public ValueDecl, QualType T, llvm::MutableArrayRef CH); - static IndirectFieldDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static IndirectFieldDecl *CreateDeserialized(ASTContext &C, DeclID ID); using chain_iterator = ArrayRef::const_iterator; @@ -3542,7 +3542,7 @@ class TypedefDecl : public TypedefNameDecl { static TypedefDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, TypeSourceInfo *TInfo); - static TypedefDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static TypedefDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -3567,7 +3567,7 @@ class TypeAliasDecl : public TypedefNameDecl { static TypeAliasDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, TypeSourceInfo *TInfo); - static TypeAliasDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static TypeAliasDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -3977,7 +3977,7 @@ class EnumDecl : public TagDecl { IdentifierInfo *Id, EnumDecl *PrevDecl, bool IsScoped, bool IsScopedUsingClassTag, bool IsFixed); - static EnumDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static EnumDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Overrides to provide correct range when there's an enum-base specifier /// with forward declarations. 
@@ -4182,7 +4182,7 @@ class RecordDecl : public TagDecl { static RecordDecl *Create(const ASTContext &C, TagKind TK, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, RecordDecl* PrevDecl = nullptr); - static RecordDecl *CreateDeserialized(const ASTContext &C, unsigned ID); + static RecordDecl *CreateDeserialized(const ASTContext &C, DeclID ID); RecordDecl *getPreviousDecl() { return cast_or_null( @@ -4433,7 +4433,7 @@ class FileScopeAsmDecl : public Decl { StringLiteral *Str, SourceLocation AsmLoc, SourceLocation RParenLoc); - static FileScopeAsmDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static FileScopeAsmDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceLocation getAsmLoc() const { return getLocation(); } SourceLocation getRParenLoc() const { return RParenLoc; } @@ -4469,7 +4469,7 @@ class TopLevelStmtDecl : public Decl, public DeclContext { public: static TopLevelStmtDecl *Create(ASTContext &C, Stmt *Statement); - static TopLevelStmtDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static TopLevelStmtDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; Stmt *getStmt() { return Statement; } @@ -4563,7 +4563,7 @@ class BlockDecl : public Decl, public DeclContext { public: static BlockDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation L); - static BlockDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static BlockDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceLocation getCaretLocation() const { return getLocation(); } @@ -4717,7 +4717,7 @@ class CapturedDecl final static CapturedDecl *Create(ASTContext &C, DeclContext *DC, unsigned NumParams); - static CapturedDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static CapturedDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned NumParams); Stmt *getBody() const override; @@ -4851,7 +4851,7 @@ class ImportDecl final : public Decl, SourceLocation EndLoc); /// Create a new, deserialized module import declaration. - static ImportDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static ImportDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned NumLocations); /// Retrieve the module that was imported by the import declaration. 
@@ -4892,7 +4892,7 @@ class ExportDecl final : public Decl, public DeclContext { public: static ExportDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation ExportLoc); - static ExportDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ExportDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceLocation getExportLoc() const { return getLocation(); } SourceLocation getRBraceLoc() const { return RBraceLoc; } @@ -4931,7 +4931,7 @@ class EmptyDecl : public Decl { public: static EmptyDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation L); - static EmptyDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static EmptyDecl *CreateDeserialized(ASTContext &C, DeclID ID); static bool classof(const Decl *D) { return classofKind(D->getKind()); } static bool classofKind(Kind K) { return K == Empty; } @@ -4957,7 +4957,7 @@ class HLSLBufferDecl final : public NamedDecl, public DeclContext { bool CBuffer, SourceLocation KwLoc, IdentifierInfo *ID, SourceLocation IDLoc, SourceLocation LBrace); - static HLSLBufferDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static HLSLBufferDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY { return SourceRange(getLocStart(), RBraceLoc); diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 1079993f496945..161e14fc896922 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -349,6 +349,8 @@ class alignas(8) Decl { LLVM_PREFERRED_TYPE(Linkage) mutable unsigned CacheValidAndLinkage : 3; + using DeclID = uint32_t; + /// Allocate memory for a deserialized declaration. /// /// This routine must be used to allocate memory for any declaration that is @@ -358,7 +360,7 @@ class alignas(8) Decl { /// \param Ctx The context in which we will allocate memory. /// \param ID The global ID of the deserialized declaration. /// \param Extra The amount of extra space to allocate after the object. - void *operator new(std::size_t Size, const ASTContext &Ctx, unsigned ID, + void *operator new(std::size_t Size, const ASTContext &Ctx, DeclID ID, std::size_t Extra = 0); /// Allocate memory for a non-deserialized declaration. diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 7aed4d5cbc002e..a7644d2a19d245 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -120,7 +120,7 @@ class AccessSpecDecl : public Decl { return new (C, DC) AccessSpecDecl(AS, DC, ASLoc, ColonLoc); } - static AccessSpecDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static AccessSpecDecl *CreateDeserialized(ASTContext &C, DeclID ID); // Implement isa/cast/dyncast/etc. 
static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -579,7 +579,7 @@ class CXXRecordDecl : public RecordDecl { TypeSourceInfo *Info, SourceLocation Loc, unsigned DependencyKind, bool IsGeneric, LambdaCaptureDefault CaptureDefault); - static CXXRecordDecl *CreateDeserialized(const ASTContext &C, unsigned ID); + static CXXRecordDecl *CreateDeserialized(const ASTContext &C, DeclID ID); bool isDynamicClass() const { return data().Polymorphic || data().NumVBases != 0; @@ -1980,7 +1980,7 @@ class CXXDeductionGuideDecl : public FunctionDecl { CXXConstructorDecl *Ctor = nullptr, DeductionCandidate Kind = DeductionCandidate::Normal); - static CXXDeductionGuideDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static CXXDeductionGuideDecl *CreateDeserialized(ASTContext &C, DeclID ID); ExplicitSpecifier getExplicitSpecifier() { return ExplicitSpec; } const ExplicitSpecifier getExplicitSpecifier() const { return ExplicitSpec; } @@ -2035,7 +2035,7 @@ class RequiresExprBodyDecl : public Decl, public DeclContext { static RequiresExprBodyDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc); - static RequiresExprBodyDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static RequiresExprBodyDecl *CreateDeserialized(ASTContext &C, DeclID ID); // Implement isa/cast/dyncast/etc. static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -2078,7 +2078,7 @@ class CXXMethodDecl : public FunctionDecl { ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, Expr *TrailingRequiresClause = nullptr); - static CXXMethodDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static CXXMethodDecl *CreateDeserialized(ASTContext &C, DeclID ID); bool isStatic() const; bool isInstance() const { return !isStatic(); } @@ -2579,7 +2579,7 @@ class CXXConstructorDecl final friend class ASTDeclWriter; friend TrailingObjects; - static CXXConstructorDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static CXXConstructorDecl *CreateDeserialized(ASTContext &C, DeclID ID, uint64_t AllocKind); static CXXConstructorDecl * Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, @@ -2822,7 +2822,7 @@ class CXXDestructorDecl : public CXXMethodDecl { bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, Expr *TrailingRequiresClause = nullptr); - static CXXDestructorDecl *CreateDeserialized(ASTContext & C, unsigned ID); + static CXXDestructorDecl *CreateDeserialized(ASTContext & C, DeclID ID); void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg); @@ -2881,7 +2881,7 @@ class CXXConversionDecl : public CXXMethodDecl { bool UsesFPIntrin, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, Expr *TrailingRequiresClause = nullptr); - static CXXConversionDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static CXXConversionDecl *CreateDeserialized(ASTContext &C, DeclID ID); ExplicitSpecifier getExplicitSpecifier() { return getCanonicalDecl()->ExplicitSpec; @@ -2948,7 +2948,7 @@ class LinkageSpecDecl : public Decl, public DeclContext { SourceLocation ExternLoc, SourceLocation LangLoc, LinkageSpecLanguageIDs Lang, bool HasBraces); - static LinkageSpecDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static LinkageSpecDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Return the language specified by this linkage specification. 
LinkageSpecLanguageIDs getLanguage() const { @@ -3096,7 +3096,7 @@ class UsingDirectiveDecl : public NamedDecl { SourceLocation IdentLoc, NamedDecl *Nominated, DeclContext *CommonAncestor); - static UsingDirectiveDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static UsingDirectiveDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY { return SourceRange(UsingLoc, getLocation()); @@ -3157,7 +3157,7 @@ class NamespaceAliasDecl : public NamedDecl, SourceLocation IdentLoc, NamedDecl *Namespace); - static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, DeclID ID); using redecl_range = redeclarable_base::redecl_range; using redecl_iterator = redeclarable_base::redecl_iterator; @@ -3254,7 +3254,7 @@ class LifetimeExtendedTemporaryDecl final LifetimeExtendedTemporaryDecl(Temp, EDec, Mangling); } static LifetimeExtendedTemporaryDecl *CreateDeserialized(ASTContext &C, - unsigned ID) { + DeclID ID) { return new (C, ID) LifetimeExtendedTemporaryDecl(EmptyShell{}); } @@ -3357,7 +3357,7 @@ class UsingShadowDecl : public NamedDecl, public Redeclarable { UsingShadowDecl(UsingShadow, C, DC, Loc, Name, Introducer, Target); } - static UsingShadowDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static UsingShadowDecl *CreateDeserialized(ASTContext &C, DeclID ID); using redecl_range = redeclarable_base::redecl_range; using redecl_iterator = redeclarable_base::redecl_iterator; @@ -3566,7 +3566,7 @@ class UsingDecl : public BaseUsingDecl, public Mergeable { const DeclarationNameInfo &NameInfo, bool HasTypenameKeyword); - static UsingDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static UsingDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -3645,7 +3645,7 @@ class ConstructorUsingShadowDecl final : public UsingShadowDecl { UsingDecl *Using, NamedDecl *Target, bool IsVirtual); static ConstructorUsingShadowDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); /// Override the UsingShadowDecl's getIntroducer, returning the UsingDecl that /// introduced this. @@ -3757,7 +3757,7 @@ class UsingEnumDecl : public BaseUsingDecl, public Mergeable { SourceLocation UsingL, SourceLocation EnumL, SourceLocation NameL, TypeSourceInfo *EnumType); - static UsingEnumDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static UsingEnumDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -3830,7 +3830,7 @@ class UsingPackDecl final NamedDecl *InstantiatedFrom, ArrayRef UsingDecls); - static UsingPackDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static UsingPackDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned NumExpansions); SourceRange getSourceRange() const override LLVM_READONLY { @@ -3924,7 +3924,7 @@ class UnresolvedUsingValueDecl : public ValueDecl, const DeclarationNameInfo &NameInfo, SourceLocation EllipsisLoc); static UnresolvedUsingValueDecl * - CreateDeserialized(ASTContext &C, unsigned ID); + CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -4015,7 +4015,7 @@ class UnresolvedUsingTypenameDecl SourceLocation EllipsisLoc); static UnresolvedUsingTypenameDecl * - CreateDeserialized(ASTContext &C, unsigned ID); + CreateDeserialized(ASTContext &C, DeclID ID); /// Retrieves the canonical declaration of this declaration. 
UnresolvedUsingTypenameDecl *getCanonicalDecl() override { @@ -4045,7 +4045,7 @@ class UnresolvedUsingIfExistsDecl final : public NamedDecl { SourceLocation Loc, DeclarationName Name); static UnresolvedUsingIfExistsDecl *CreateDeserialized(ASTContext &Ctx, - unsigned ID); + DeclID ID); static bool classof(const Decl *D) { return classofKind(D->getKind()); } static bool classofKind(Kind K) { return K == Decl::UnresolvedUsingIfExists; } @@ -4073,7 +4073,7 @@ class StaticAssertDecl : public Decl { SourceLocation StaticAssertLoc, Expr *AssertExpr, Expr *Message, SourceLocation RParenLoc, bool Failed); - static StaticAssertDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static StaticAssertDecl *CreateDeserialized(ASTContext &C, DeclID ID); Expr *getAssertExpr() { return AssertExprAndFailed.getPointer(); } const Expr *getAssertExpr() const { return AssertExprAndFailed.getPointer(); } @@ -4120,7 +4120,7 @@ class BindingDecl : public ValueDecl { static BindingDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id); - static BindingDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static BindingDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Get the expression to which this declaration is bound. This may be null /// in two different cases: while parsing the initializer for the @@ -4189,7 +4189,7 @@ class DecompositionDecl final QualType T, TypeSourceInfo *TInfo, StorageClass S, ArrayRef Bindings); - static DecompositionDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static DecompositionDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned NumBindings); ArrayRef bindings() const { @@ -4246,7 +4246,7 @@ class MSPropertyDecl : public DeclaratorDecl { SourceLocation L, DeclarationName N, QualType T, TypeSourceInfo *TInfo, SourceLocation StartL, IdentifierInfo *Getter, IdentifierInfo *Setter); - static MSPropertyDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static MSPropertyDecl *CreateDeserialized(ASTContext &C, DeclID ID); static bool classof(const Decl *D) { return D->getKind() == MSProperty; } @@ -4300,7 +4300,7 @@ class MSGuidDecl : public ValueDecl, MSGuidDecl(DeclContext *DC, QualType T, Parts P); static MSGuidDecl *Create(const ASTContext &C, QualType T, Parts P); - static MSGuidDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static MSGuidDecl *CreateDeserialized(ASTContext &C, DeclID ID); // Only ASTContext::getMSGuidDecl and deserialization create these. friend class ASTContext; @@ -4353,7 +4353,7 @@ class UnnamedGlobalConstantDecl : public ValueDecl, static UnnamedGlobalConstantDecl *Create(const ASTContext &C, QualType T, const APValue &APVal); static UnnamedGlobalConstantDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); // Only ASTContext::getUnnamedGlobalConstantDecl and deserialization create // these. 
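Almost all of the header churn above and below this point is one mechanical substitution: every `CreateDeserialized` factory, and the deserialization `operator new` declared in the DeclBase.h hunk, now takes the new `Decl::DeclID` alias (currently `uint32_t`) instead of a bare `unsigned`. A minimal standalone sketch of the pattern, with stub types rather than clang's real classes so it compiles on its own:

#include <cstdint>
#include <iostream>

struct ASTContext {}; // stand-in for clang::ASTContext (illustration only)

class Decl {
public:
  // Mirrors the alias added to clang::Decl in the DeclBase.h hunk above.
  using DeclID = uint32_t;

protected:
  DeclID ID = 0;
};

class VarDecl : public Decl {
public:
  // Deserialization factory in the new style: the parameter is typed as
  // Decl::DeclID rather than a bare `unsigned`. (The real factory allocates
  // through the dedicated `operator new(Size, Ctx, ID)` shown in DeclBase.h.)
  static VarDecl *CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
    (void)C;
    auto *D = new VarDecl();
    D->ID = ID;
    return D;
  }
};

int main() {
  ASTContext Ctx;
  VarDecl *D = VarDecl::CreateDeserialized(Ctx, 42);
  std::cout << "stub decl deserialized\n";
  delete D;
}

Putting the ID type behind a single alias means a later widening (or a move to a strongly typed ID class) touches only the alias, not hundreds of signatures.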
diff --git a/clang/include/clang/AST/DeclFriend.h b/clang/include/clang/AST/DeclFriend.h index 3e6ca5b3219259..b56627a5337d63 100644 --- a/clang/include/clang/AST/DeclFriend.h +++ b/clang/include/clang/AST/DeclFriend.h @@ -112,7 +112,7 @@ class FriendDecl final Create(ASTContext &C, DeclContext *DC, SourceLocation L, FriendUnion Friend_, SourceLocation FriendL, ArrayRef FriendTypeTPLists = std::nullopt); - static FriendDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static FriendDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned FriendTypeNumTPLists); /// If this friend declaration names an (untemplated but possibly diff --git a/clang/include/clang/AST/DeclObjC.h b/clang/include/clang/AST/DeclObjC.h index b8d17dd06d1550..7780afa6f1cf5c 100644 --- a/clang/include/clang/AST/DeclObjC.h +++ b/clang/include/clang/AST/DeclObjC.h @@ -236,7 +236,7 @@ class ObjCMethodDecl : public NamedDecl, public DeclContext { ObjCImplementationControl impControl = ObjCImplementationControl::None, bool HasRelatedResultType = false); - static ObjCMethodDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCMethodDecl *CreateDeserialized(ASTContext &C, DeclID ID); ObjCMethodDecl *getCanonicalDecl() override; const ObjCMethodDecl *getCanonicalDecl() const { @@ -614,7 +614,7 @@ class ObjCTypeParamDecl : public TypedefNameDecl { IdentifierInfo *name, SourceLocation colonLoc, TypeSourceInfo *boundInfo); - static ObjCTypeParamDecl *CreateDeserialized(ASTContext &ctx, unsigned ID); + static ObjCTypeParamDecl *CreateDeserialized(ASTContext &ctx, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -789,7 +789,7 @@ class ObjCPropertyDecl : public NamedDecl { TypeSourceInfo *TSI, PropertyControl propControl = None); - static ObjCPropertyDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCPropertyDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceLocation getAtLoc() const { return AtLoc; } void setAtLoc(SourceLocation L) { AtLoc = L; } @@ -1279,7 +1279,7 @@ class ObjCInterfaceDecl : public ObjCContainerDecl ObjCInterfaceDecl *PrevDecl, SourceLocation ClassLoc = SourceLocation(), bool isInternal = false); - static ObjCInterfaceDecl *CreateDeserialized(const ASTContext &C, unsigned ID); + static ObjCInterfaceDecl *CreateDeserialized(const ASTContext &C, DeclID ID); /// Retrieve the type parameters of this class. /// @@ -1969,7 +1969,7 @@ class ObjCIvarDecl : public FieldDecl { TypeSourceInfo *TInfo, AccessControl ac, Expr *BW = nullptr, bool synthesized = false); - static ObjCIvarDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCIvarDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Return the class interface that this ivar is logically contained /// in; this is either the interface where the ivar was declared, or the @@ -2039,7 +2039,7 @@ class ObjCAtDefsFieldDecl : public FieldDecl { SourceLocation IdLoc, IdentifierInfo *Id, QualType T, Expr *BW); - static ObjCAtDefsFieldDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCAtDefsFieldDecl *CreateDeserialized(ASTContext &C, DeclID ID); // Implement isa/cast/dyncast/etc. 
static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -2142,7 +2142,7 @@ class ObjCProtocolDecl : public ObjCContainerDecl, SourceLocation atStartLoc, ObjCProtocolDecl *PrevDecl); - static ObjCProtocolDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCProtocolDecl *CreateDeserialized(ASTContext &C, DeclID ID); const ObjCProtocolList &getReferencedProtocols() const { assert(hasDefinition() && "No definition available!"); @@ -2361,7 +2361,7 @@ class ObjCCategoryDecl : public ObjCContainerDecl { ObjCTypeParamList *typeParamList, SourceLocation IvarLBraceLoc = SourceLocation(), SourceLocation IvarRBraceLoc = SourceLocation()); - static ObjCCategoryDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCCategoryDecl *CreateDeserialized(ASTContext &C, DeclID ID); ObjCInterfaceDecl *getClassInterface() { return ClassInterface; } const ObjCInterfaceDecl *getClassInterface() const { return ClassInterface; } @@ -2558,7 +2558,7 @@ class ObjCCategoryImplDecl : public ObjCImplDecl { Create(ASTContext &C, DeclContext *DC, const IdentifierInfo *Id, ObjCInterfaceDecl *classInterface, SourceLocation nameLoc, SourceLocation atStartLoc, SourceLocation CategoryNameLoc); - static ObjCCategoryImplDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCCategoryImplDecl *CreateDeserialized(ASTContext &C, DeclID ID); ObjCCategoryDecl *getCategoryDecl() const; @@ -2640,7 +2640,7 @@ class ObjCImplementationDecl : public ObjCImplDecl { SourceLocation IvarLBraceLoc=SourceLocation(), SourceLocation IvarRBraceLoc=SourceLocation()); - static ObjCImplementationDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCImplementationDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// init_iterator - Iterates through the ivar initializer list. using init_iterator = CXXCtorInitializer **; @@ -2780,7 +2780,7 @@ class ObjCCompatibleAliasDecl : public NamedDecl { ObjCInterfaceDecl* aliasedClass); static ObjCCompatibleAliasDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); const ObjCInterfaceDecl *getClassInterface() const { return AliasedClass; } ObjCInterfaceDecl *getClassInterface() { return AliasedClass; } @@ -2851,7 +2851,7 @@ class ObjCPropertyImplDecl : public Decl { ObjCIvarDecl *ivarDecl, SourceLocation ivarLoc); - static ObjCPropertyImplDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ObjCPropertyImplDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; diff --git a/clang/include/clang/AST/DeclOpenMP.h b/clang/include/clang/AST/DeclOpenMP.h index 8fdfddb6c1fd74..c7ede7f2157fef 100644 --- a/clang/include/clang/AST/DeclOpenMP.h +++ b/clang/include/clang/AST/DeclOpenMP.h @@ -133,7 +133,7 @@ class OMPThreadPrivateDecl final : public OMPDeclarativeDirective { SourceLocation L, ArrayRef VL); static OMPThreadPrivateDecl *CreateDeserialized(ASTContext &C, - unsigned ID, unsigned N); + DeclID ID, unsigned N); typedef MutableArrayRef::iterator varlist_iterator; typedef ArrayRef::iterator varlist_const_iterator; @@ -214,7 +214,7 @@ class OMPDeclareReductionDecl final : public ValueDecl, public DeclContext { QualType T, OMPDeclareReductionDecl *PrevDeclInScope); /// Create deserialized declare reduction node. static OMPDeclareReductionDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); /// Get combiner expression of the declare reduction construct. 
Expr *getCombiner() { return Combiner; } @@ -318,7 +318,7 @@ class OMPDeclareMapperDecl final : public OMPDeclarativeDirective, ArrayRef Clauses, OMPDeclareMapperDecl *PrevDeclInScope); /// Creates deserialized declare mapper node. - static OMPDeclareMapperDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static OMPDeclareMapperDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned N); using clauselist_iterator = MutableArrayRef::iterator; @@ -397,7 +397,7 @@ class OMPCapturedExprDecl final : public VarDecl { IdentifierInfo *Id, QualType T, SourceLocation StartLoc); - static OMPCapturedExprDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static OMPCapturedExprDecl *CreateDeserialized(ASTContext &C, DeclID ID); SourceRange getSourceRange() const override LLVM_READONLY; @@ -427,7 +427,7 @@ class OMPRequiresDecl final : public OMPDeclarativeDirective { static OMPRequiresDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation L, ArrayRef CL); /// Create deserialized requires node. - static OMPRequiresDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static OMPRequiresDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned N); using clauselist_iterator = MutableArrayRef::iterator; @@ -495,7 +495,7 @@ class OMPAllocateDecl final : public OMPDeclarativeDirective { static OMPAllocateDecl *Create(ASTContext &C, DeclContext *DC, SourceLocation L, ArrayRef VL, ArrayRef CL); - static OMPAllocateDecl *CreateDeserialized(ASTContext &C, unsigned ID, + static OMPAllocateDecl *CreateDeserialized(ASTContext &C, DeclID ID, unsigned NVars, unsigned NClauses); typedef MutableArrayRef::iterator varlist_iterator; diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index f24e71ff229648..e2afff8d445016 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -1087,7 +1087,7 @@ class FunctionTemplateDecl : public RedeclarableTemplateDecl { NamedDecl *Decl); /// Create an empty function template node. 
- static FunctionTemplateDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static FunctionTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID); // Implement isa/cast/dyncast support static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -1204,9 +1204,9 @@ class TemplateTypeParmDecl final : public TypeDecl, bool Typename, bool ParameterPack, bool HasTypeConstraint = false, std::optional NumExpanded = std::nullopt); static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C, - unsigned ID); + DeclID ID); static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C, - unsigned ID, + DeclID ID, bool HasTypeConstraint); /// Whether this template type parameter was declared with @@ -1414,10 +1414,10 @@ class NonTypeTemplateParmDecl final ArrayRef ExpandedTInfos); static NonTypeTemplateParmDecl *CreateDeserialized(ASTContext &C, - unsigned ID, + DeclID ID, bool HasTypeConstraint); static NonTypeTemplateParmDecl *CreateDeserialized(ASTContext &C, - unsigned ID, + DeclID ID, unsigned NumExpandedTypes, bool HasTypeConstraint); @@ -1632,9 +1632,9 @@ class TemplateTemplateParmDecl final ArrayRef Expansions); static TemplateTemplateParmDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); static TemplateTemplateParmDecl *CreateDeserialized(ASTContext &C, - unsigned ID, + DeclID ID, unsigned NumExpansions); using TemplateParmPosition::getDepth; @@ -1858,7 +1858,7 @@ class ClassTemplateSpecializationDecl ArrayRef Args, ClassTemplateSpecializationDecl *PrevDecl); static ClassTemplateSpecializationDecl * - CreateDeserialized(ASTContext &C, unsigned ID); + CreateDeserialized(ASTContext &C, DeclID ID); void getNameForDiagnostic(raw_ostream &OS, const PrintingPolicy &Policy, bool Qualified) const override; @@ -2110,7 +2110,7 @@ class ClassTemplatePartialSpecializationDecl ClassTemplatePartialSpecializationDecl *PrevDecl); static ClassTemplatePartialSpecializationDecl * - CreateDeserialized(ASTContext &C, unsigned ID); + CreateDeserialized(ASTContext &C, DeclID ID); ClassTemplatePartialSpecializationDecl *getMostRecentDecl() { return cast( @@ -2306,7 +2306,7 @@ class ClassTemplateDecl : public RedeclarableTemplateDecl { NamedDecl *Decl); /// Create an empty class template node. - static ClassTemplateDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ClassTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Return the specialization with the provided arguments if it exists, /// otherwise return the insertion point. @@ -2472,7 +2472,7 @@ class FriendTemplateDecl : public Decl { MutableArrayRef Params, FriendUnion Friend, SourceLocation FriendLoc); - static FriendTemplateDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static FriendTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// If this friend declaration names a templated type (or /// a dependent member type of a templated type), return that @@ -2573,7 +2573,7 @@ class TypeAliasTemplateDecl : public RedeclarableTemplateDecl { NamedDecl *Decl); /// Create an empty alias template node. 
- static TypeAliasTemplateDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static TypeAliasTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID); // Implement isa/cast/dyncast support static bool classof(const Decl *D) { return classofKind(D->getKind()); } @@ -2670,7 +2670,7 @@ class VarTemplateSpecializationDecl : public VarDecl, TypeSourceInfo *TInfo, StorageClass S, ArrayRef<TemplateArgument> Args); static VarTemplateSpecializationDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); void getNameForDiagnostic(raw_ostream &OS, const PrintingPolicy &Policy, bool Qualified) const override; @@ -2901,7 +2901,7 @@ class VarTemplatePartialSpecializationDecl const TemplateArgumentListInfo &ArgInfos); static VarTemplatePartialSpecializationDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); VarTemplatePartialSpecializationDecl *getMostRecentDecl() { return cast<VarTemplatePartialSpecializationDecl>( @@ -3078,7 +3078,7 @@ class VarTemplateDecl : public RedeclarableTemplateDecl { VarDecl *Decl); /// Create an empty variable template node. - static VarTemplateDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static VarTemplateDecl *CreateDeserialized(ASTContext &C, DeclID ID); /// Return the specialization with the provided arguments if it exists, /// otherwise return the insertion point. @@ -3183,7 +3183,7 @@ class ConceptDecl : public TemplateDecl, public Mergeable<ConceptDecl> { SourceLocation L, DeclarationName Name, TemplateParameterList *Params, Expr *ConstraintExpr); - static ConceptDecl *CreateDeserialized(ASTContext &C, unsigned ID); + static ConceptDecl *CreateDeserialized(ASTContext &C, DeclID ID); Expr *getConstraintExpr() const { return ConstraintExpr; @@ -3232,7 +3232,7 @@ class ImplicitConceptSpecializationDecl final Create(const ASTContext &C, DeclContext *DC, SourceLocation SL, ArrayRef<TemplateArgument> ConvertedArgs); static ImplicitConceptSpecializationDecl * - CreateDeserialized(const ASTContext &C, unsigned ID, + CreateDeserialized(const ASTContext &C, DeclID ID, unsigned NumTemplateArgs); ArrayRef<TemplateArgument> getTemplateArguments() const { @@ -3275,7 +3275,7 @@ class TemplateParamObjectDecl : public ValueDecl, static TemplateParamObjectDecl *Create(const ASTContext &C, QualType T, const APValue &V); static TemplateParamObjectDecl *CreateDeserialized(ASTContext &C, - unsigned ID); + DeclID ID); /// Only ASTContext::getTemplateParamObjectDecl and deserialization /// create these. diff --git a/clang/include/clang/Analysis/FlowSensitive/ASTOps.h b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h index f9fd3db1fb67fc..05748f300a932f 100644 --- a/clang/include/clang/Analysis/FlowSensitive/ASTOps.h +++ b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h @@ -96,6 +96,9 @@ struct ReferencedDecls { /// Returns declarations that are declared in or referenced from `FD`. ReferencedDecls getReferencedDecls(const FunctionDecl &FD); +/// Returns declarations that are declared in or referenced from `S`. +ReferencedDecls getReferencedDecls(const Stmt &S); + } // namespace dataflow } // namespace clang diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 5251774ff4efd6..47747d8704b6c8 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1412,6 +1412,9 @@ def MultiGPU: DiagGroup<"multi-gpu">; // libc and the CRT to be skipped. def AVRRtlibLinkingQuirks : DiagGroup<"avr-rtlib-linking-quirks">; +// A warning group related to AArch64 SME function attributes.
+def AArch64SMEAttributes : DiagGroup<"aarch64-sme-attributes">; + // A warning group for things that will change semantics in the future. def FutureCompat : DiagGroup<"future-compat">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 1a2d8bf4e4eb15..1bbe76ff6bd2ac 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3754,6 +3754,16 @@ def err_sme_definition_using_za_in_non_sme_target : Error< "function using ZA state requires 'sme'">; def err_sme_definition_using_zt0_in_non_sme2_target : Error< "function using ZT0 state requires 'sme2'">; +def warn_sme_streaming_pass_return_vl_to_non_streaming : Warning< + "passing a VL-dependent argument to/from a function that has a different" + " streaming-mode. The streaming and non-streaming vector lengths may be" + " different">, + InGroup<AArch64SMEAttributes>, DefaultIgnore; +def warn_sme_locally_streaming_has_vl_args_returns : Warning< + "passing/returning a VL-dependent argument to/from a __arm_locally_streaming" + " function. The streaming and non-streaming vector" + " lengths may be different">, + InGroup<AArch64SMEAttributes>, DefaultIgnore; def err_conflicting_attributes_arm_state : Error< "conflicting attributes for state '%0'">; def err_sme_streaming_cannot_be_multiversioned : Error< diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 24b109e32cdd3e..75562284ec7de0 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -224,6 +224,11 @@ class LangOptionsBase { /// - the parameter list of a template template parameter Ver17, + /// Attempt to be ABI-compatible with code generated by Clang 18.0.x. + /// This causes clang to revert some fixes to the mangling of lambdas + /// in the initializers of members of local classes. + Ver18, + /// Conform to the underlying platform's C and C++ ABIs as closely /// as we can. Latest diff --git a/clang/include/clang/Driver/OffloadBundler.h b/clang/include/clang/Driver/OffloadBundler.h index 65d33bfbd2825f..57ecbdcb7d040e 100644 --- a/clang/include/clang/Driver/OffloadBundler.h +++ b/clang/include/clang/Driver/OffloadBundler.h @@ -100,6 +100,7 @@ struct OffloadTargetInfo { // - Version (2 bytes) // - Compression Method (2 bytes) - Uses the values from // llvm::compression::Format. +// - Total file size (4 bytes). Available in version 2 and above. // - Uncompressed Size (4 bytes). // - Truncated MD5 Hash (8 bytes). // - Compressed Data (variable length).
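To make the layout above concrete, here is a hypothetical standalone reader for the version-2 header; this is a sketch, not the actual OffloadBundler implementation, and it assumes the fields are stored little-endian and read on a matching host, which the format description does not spell out:

#include <cstdint>
#include <cstring>
#include <optional>
#include <vector>

// Field layout assumed from the comment block above: magic, version,
// compression method, total file size (v2 only), uncompressed size,
// truncated MD5 hash, then the compressed data.
struct CompressedBundleHeaderV2 {
  std::uint16_t Version;          // expected to be 2
  std::uint16_t Method;           // llvm::compression::Format, cast to 16 bits
  std::uint32_t TotalFileSize;    // whole file, header included (v2 addition)
  std::uint32_t UncompressedSize; // bundle size before compression
  std::uint64_t Hash;             // truncated MD5 of the uncompressed data
};

std::optional<CompressedBundleHeaderV2>
parseHeader(const std::vector<std::uint8_t> &Buf) {
  constexpr std::size_t V2HeaderSize = 4 + 2 + 2 + 4 + 4 + 8; // 24 bytes
  if (Buf.size() < V2HeaderSize || std::memcmp(Buf.data(), "CCOB", 4) != 0)
    return std::nullopt;

  CompressedBundleHeaderV2 H;
  // Plain memcpy per field: assumes the file and the host agree on byte
  // order (real code would go through an explicit-endianness reader).
  std::memcpy(&H.Version, Buf.data() + 4, 2);
  if (H.Version != 2)
    return std::nullopt; // a v1 header is 4 bytes shorter: no file-size field
  std::memcpy(&H.Method, Buf.data() + 6, 2);
  std::memcpy(&H.TotalFileSize, Buf.data() + 8, 4);
  std::memcpy(&H.UncompressedSize, Buf.data() + 12, 4);
  std::memcpy(&H.Hash, Buf.data() + 16, 8);
  return H;
}

The v1/v2 split mirrors the V1HeaderSize/V2HeaderSize constants introduced in the next hunk: the only layout difference is the extra 32-bit total-file-size field.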
@@ -109,13 +110,17 @@ class CompressedOffloadBundle { static inline const size_t MagicSize = 4; static inline const size_t VersionFieldSize = sizeof(uint16_t); static inline const size_t MethodFieldSize = sizeof(uint16_t); - static inline const size_t SizeFieldSize = sizeof(uint32_t); - static inline const size_t HashFieldSize = 8; - static inline const size_t HeaderSize = MagicSize + VersionFieldSize + - MethodFieldSize + SizeFieldSize + - HashFieldSize; + static inline const size_t FileSizeFieldSize = sizeof(uint32_t); + static inline const size_t UncompressedSizeFieldSize = sizeof(uint32_t); + static inline const size_t HashFieldSize = sizeof(uint64_t); + static inline const size_t V1HeaderSize = + MagicSize + VersionFieldSize + MethodFieldSize + + UncompressedSizeFieldSize + HashFieldSize; + static inline const size_t V2HeaderSize = + MagicSize + VersionFieldSize + FileSizeFieldSize + MethodFieldSize + + UncompressedSizeFieldSize + HashFieldSize; static inline const llvm::StringRef MagicNumber = "CCOB"; - static inline const uint16_t Version = 1; + static inline const uint16_t Version = 2; public: static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d83031b05cebc4..52d161703f965e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3100,6 +3100,11 @@ defm modules_skip_header_search_paths : BoolFOption<"modules-skip-header-search-paths", HeaderSearchOpts<"ModulesSkipHeaderSearchPaths">, DefaultFalse, PosFlag, NegFlag, BothFlags<[], [CC1Option]>>; +def fno_modules_prune_non_affecting_module_map_files : + Flag<["-"], "fno-modules-prune-non-affecting-module-map-files">, + Group<f_Group>, Flags<[]>, Visibility<[CC1Option]>, + MarshallingInfoNegativeFlag<HeaderSearchOpts<"ModulesPruneNonAffectingModuleMaps">>, + HelpText<"Do not prune non-affecting module map files when writing module files">; def fincremental_extensions : Flag<["-"], "fincremental-extensions">, @@ -4747,9 +4752,9 @@ def munaligned_symbols : Flag<["-"], "munaligned-symbols">, Group<m_Group>, HelpText<"Expect external char-aligned symbols to be without ABI alignment (SystemZ only)">; def mno_unaligned_symbols : Flag<["-"], "mno-unaligned-symbols">, Group<m_Group>, HelpText<"Expect external char-aligned symbols to be without ABI alignment (SystemZ only)">; -def mstrict_align : Flag<["-"], "mstrict-align">, +def mstrict_align : Flag<["-"], "mstrict-align">, Group<m_Group>, HelpText<"Force all memory accesses to be aligned (AArch64/LoongArch/RISC-V only)">; -def mno_strict_align : Flag<["-"], "mno-strict-align">, +def mno_strict_align : Flag<["-"], "mno-strict-align">, Group<m_Group>, HelpText<"Allow memory accesses to be unaligned (AArch64/LoongArch/RISC-V only)">; def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_Group>; def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_Group>, diff --git a/clang/include/clang/Lex/HeaderSearchOptions.h b/clang/include/clang/Lex/HeaderSearchOptions.h index 637dc77e5d957e..e4437ac0e35263 100644 --- a/clang/include/clang/Lex/HeaderSearchOptions.h +++ b/clang/include/clang/Lex/HeaderSearchOptions.h @@ -252,6 +252,10 @@ class HeaderSearchOptions { LLVM_PREFERRED_TYPE(bool) unsigned ModulesSkipPragmaDiagnosticMappings : 1; /// Whether to prune non-affecting module map files from PCM files.
+ LLVM_PREFERRED_TYPE(bool) + unsigned ModulesPruneNonAffectingModuleMaps : 1; + LLVM_PREFERRED_TYPE(bool) unsigned ModulesHashContent : 1; @@ -280,7 +284,8 @@ class HeaderSearchOptions { ModulesValidateDiagnosticOptions(true), ModulesSkipDiagnosticOptions(false), ModulesSkipHeaderSearchPaths(false), - ModulesSkipPragmaDiagnosticMappings(false), ModulesHashContent(false), + ModulesSkipPragmaDiagnosticMappings(false), + ModulesPruneNonAffectingModuleMaps(true), ModulesHashContent(false), ModulesStrictContextHash(false), ModulesIncludeVFSUsage(false) {} /// AddPath - Add the \p Path path to the specified \p Group list. diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 43ee06c524b3a0..1cd8b6a357cbf9 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -501,6 +501,8 @@ class ASTReader /// = I + 1 has already been loaded. llvm::PagedVector<Decl *> DeclsLoaded; + static_assert(std::is_same_v); + using GlobalDeclMapType = ContinuousRangeMap; diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 6aaa34c55ce307..f8180047f68609 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -695,7 +695,7 @@ namespace clang { // Returns true if the given function has a placeholder return type and // that type is declared inside the body of the function. // E.g. auto f() { struct X{}; return X(); } - bool hasAutoReturnTypeDeclaredInside(FunctionDecl *D); + bool hasReturnTypeDeclaredInside(FunctionDecl *D); }; template @@ -3647,15 +3647,28 @@ class IsTypeDeclaredInsideVisitor }; } // namespace -/// This function checks if the function has 'auto' return type that contains +/// This function checks if the given function has a return type that contains /// a reference (in any way) to a declaration inside the same function. -bool ASTNodeImporter::hasAutoReturnTypeDeclaredInside(FunctionDecl *D) { +bool ASTNodeImporter::hasReturnTypeDeclaredInside(FunctionDecl *D) { QualType FromTy = D->getType(); const auto *FromFPT = FromTy->getAs<FunctionProtoType>(); assert(FromFPT && "Must be called on FunctionProtoType"); + auto IsCXX11LambdaWithoutTrailingReturn = [&]() { + if (Importer.FromContext.getLangOpts().CPlusPlus14) // C++14 or later + return false; + + if (FromFPT->hasTrailingReturn()) + return false; + + if (const auto *MD = dyn_cast<CXXMethodDecl>(D)) + return cast<CXXRecordDecl>(MD->getDeclContext())->isLambda(); + + return false; + }; + QualType RetT = FromFPT->getReturnType(); - if (isa<AutoType>(RetT.getTypePtr())) { + if (isa<AutoType>(RetT.getTypePtr()) || IsCXX11LambdaWithoutTrailingReturn()) { FunctionDecl *Def = D->getDefinition(); IsTypeDeclaredInsideVisitor Visitor(Def ? Def : D); return Visitor.CheckType(RetT); @@ -3811,7 +3824,7 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { // E.g.: auto foo() { struct X{}; return X(); } // To avoid an infinite recursion when importing, create the FunctionDecl // with a simplified return type.
- if (hasAutoReturnTypeDeclaredInside(D)) { + if (hasReturnTypeDeclaredInside(D)) { FromReturnTy = Importer.getFromContext().VoidTy; UsedDifferentProtoType = true; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 1b99c435aebb2a..474e0ccde5bbf7 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2151,7 +2151,7 @@ VarDecl *VarDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation StartL, return new (C, DC) VarDecl(Var, C, DC, StartL, IdL, Id, T, TInfo, S); } -VarDecl *VarDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +VarDecl *VarDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) VarDecl(Var, C, nullptr, SourceLocation(), SourceLocation(), nullptr, QualType(), nullptr, SC_None); @@ -2929,7 +2929,7 @@ QualType ParmVarDecl::getOriginalType() const { return T; } -ParmVarDecl *ParmVarDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ParmVarDecl *ParmVarDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ParmVarDecl(ParmVar, C, nullptr, SourceLocation(), SourceLocation(), nullptr, QualType(), nullptr, SC_None, nullptr); @@ -4553,7 +4553,7 @@ FieldDecl *FieldDecl::Create(const ASTContext &C, DeclContext *DC, BW, Mutable, InitStyle); } -FieldDecl *FieldDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +FieldDecl *FieldDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) FieldDecl(Field, nullptr, SourceLocation(), SourceLocation(), nullptr, QualType(), nullptr, nullptr, false, ICIS_NoInit); @@ -4863,7 +4863,7 @@ EnumDecl *EnumDecl::Create(ASTContext &C, DeclContext *DC, return Enum; } -EnumDecl *EnumDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +EnumDecl *EnumDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { EnumDecl *Enum = new (C, ID) EnumDecl(C, nullptr, SourceLocation(), SourceLocation(), nullptr, nullptr, false, false, false); @@ -5025,7 +5025,7 @@ RecordDecl *RecordDecl::Create(const ASTContext &C, TagKind TK, DeclContext *DC, return R; } -RecordDecl *RecordDecl::CreateDeserialized(const ASTContext &C, unsigned ID) { +RecordDecl *RecordDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID) { RecordDecl *R = new (C, ID) RecordDecl(Record, TagTypeKind::Struct, C, nullptr, SourceLocation(), SourceLocation(), nullptr, nullptr); @@ -5297,7 +5297,7 @@ PragmaCommentDecl *PragmaCommentDecl::Create(const ASTContext &C, } PragmaCommentDecl *PragmaCommentDecl::CreateDeserialized(ASTContext &C, - unsigned ID, + Decl::DeclID ID, unsigned ArgSize) { return new (C, ID, additionalSizeToAlloc<char>(ArgSize + 1)) PragmaCommentDecl(nullptr, SourceLocation(), PCK_Unknown); @@ -5322,7 +5322,7 @@ PragmaDetectMismatchDecl::Create(const ASTContext &C, TranslationUnitDecl *DC, } PragmaDetectMismatchDecl * -PragmaDetectMismatchDecl::CreateDeserialized(ASTContext &C, unsigned ID, +PragmaDetectMismatchDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NameValueSize) { return new (C, ID, additionalSizeToAlloc<char>(NameValueSize + 1)) PragmaDetectMismatchDecl(nullptr, SourceLocation(), 0); @@ -5349,7 +5349,7 @@ LabelDecl *LabelDecl::Create(ASTContext &C, DeclContext *DC, return new (C, DC) LabelDecl(DC, IdentL, II, nullptr, GnuLabelL); } -LabelDecl *LabelDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +LabelDecl *LabelDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) LabelDecl(nullptr, SourceLocation(), nullptr, nullptr, SourceLocation()); } @@ -5390,7 +5390,7 @@ ImplicitParamDecl *ImplicitParamDecl::Create(ASTContext &C,
QualType Type, } ImplicitParamDecl *ImplicitParamDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ImplicitParamDecl(C, QualType(), ImplicitParamKind::Other); } @@ -5408,7 +5408,7 @@ FunctionDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, return New; } -FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) FunctionDecl( Function, C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, SC_None, false, false, ConstexprSpecKind::Unspecified, nullptr); @@ -5418,7 +5418,7 @@ BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) { return new (C, DC) BlockDecl(DC, L); } -BlockDecl *BlockDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +BlockDecl *BlockDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) BlockDecl(nullptr, SourceLocation()); } @@ -5432,7 +5432,7 @@ CapturedDecl *CapturedDecl::Create(ASTContext &C, DeclContext *DC, CapturedDecl(DC, NumParams); } -CapturedDecl *CapturedDecl::CreateDeserialized(ASTContext &C, unsigned ID, +CapturedDecl *CapturedDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NumParams) { return new (C, ID, additionalSizeToAlloc(NumParams)) CapturedDecl(nullptr, NumParams); @@ -5459,7 +5459,7 @@ EnumConstantDecl *EnumConstantDecl::Create(ASTContext &C, EnumDecl *CD, } EnumConstantDecl * -EnumConstantDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +EnumConstantDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) EnumConstantDecl(C, nullptr, SourceLocation(), nullptr, QualType(), nullptr, llvm::APSInt()); } @@ -5486,7 +5486,7 @@ IndirectFieldDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L, } IndirectFieldDecl *IndirectFieldDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) IndirectFieldDecl(C, nullptr, SourceLocation(), DeclarationName(), QualType(), std::nullopt); @@ -5547,7 +5547,7 @@ bool TypedefNameDecl::isTransparentTagSlow() const { return isTransparent; } -TypedefDecl *TypedefDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +TypedefDecl *TypedefDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) TypedefDecl(C, nullptr, SourceLocation(), SourceLocation(), nullptr, nullptr); } @@ -5560,7 +5560,7 @@ TypeAliasDecl *TypeAliasDecl::Create(ASTContext &C, DeclContext *DC, return new (C, DC) TypeAliasDecl(C, DC, StartLoc, IdLoc, Id, TInfo); } -TypeAliasDecl *TypeAliasDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +TypeAliasDecl *TypeAliasDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) TypeAliasDecl(C, nullptr, SourceLocation(), SourceLocation(), nullptr, nullptr); } @@ -5591,7 +5591,7 @@ FileScopeAsmDecl *FileScopeAsmDecl::Create(ASTContext &C, DeclContext *DC, } FileScopeAsmDecl *FileScopeAsmDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) FileScopeAsmDecl(nullptr, nullptr, SourceLocation(), SourceLocation()); } @@ -5609,7 +5609,7 @@ TopLevelStmtDecl *TopLevelStmtDecl::Create(ASTContext &C, Stmt *Statement) { } TopLevelStmtDecl *TopLevelStmtDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) TopLevelStmtDecl(/*DC=*/nullptr, SourceLocation(), /*S=*/nullptr); } @@ -5630,7 +5630,7 @@ EmptyDecl *EmptyDecl::Create(ASTContext &C, DeclContext *DC, 
SourceLocation L) { return new (C, DC) EmptyDecl(DC, L); } -EmptyDecl *EmptyDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +EmptyDecl *EmptyDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) EmptyDecl(nullptr, SourceLocation()); } @@ -5663,7 +5663,7 @@ HLSLBufferDecl *HLSLBufferDecl::Create(ASTContext &C, return Result; } -HLSLBufferDecl *HLSLBufferDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +HLSLBufferDecl *HLSLBufferDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) HLSLBufferDecl(nullptr, false, SourceLocation(), nullptr, SourceLocation(), SourceLocation()); } @@ -5719,7 +5719,7 @@ ImportDecl *ImportDecl::CreateImplicit(ASTContext &C, DeclContext *DC, return Import; } -ImportDecl *ImportDecl::CreateDeserialized(ASTContext &C, unsigned ID, +ImportDecl *ImportDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NumLocations) { return new (C, ID, additionalSizeToAlloc(NumLocations)) ImportDecl(EmptyShell()); @@ -5752,6 +5752,6 @@ ExportDecl *ExportDecl::Create(ASTContext &C, DeclContext *DC, return new (C, DC) ExportDecl(DC, ExportLoc); } -ExportDecl *ExportDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ExportDecl *ExportDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ExportDecl(nullptr, SourceLocation()); } diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 434926324c96ca..7cb6b31c541fd3 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -71,7 +71,7 @@ void Decl::updateOutOfDate(IdentifierInfo &II) const { #include "clang/AST/DeclNodes.inc" void *Decl::operator new(std::size_t Size, const ASTContext &Context, - unsigned ID, std::size_t Extra) { + Decl::DeclID ID, std::size_t Extra) { // Allocate an extra 8 bytes worth of storage, which ensures that the // resulting pointer will still be 8-byte aligned. 
static_assert(sizeof(unsigned) * 2 >= alignof(Decl), diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 00cc857f5e7379..426c5262051094 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -57,7 +57,7 @@ using namespace clang; void AccessSpecDecl::anchor() {} -AccessSpecDecl *AccessSpecDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +AccessSpecDecl *AccessSpecDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) AccessSpecDecl(EmptyShell()); } @@ -161,7 +161,7 @@ CXXRecordDecl::CreateLambda(const ASTContext &C, DeclContext *DC, } CXXRecordDecl * -CXXRecordDecl::CreateDeserialized(const ASTContext &C, unsigned ID) { +CXXRecordDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID) { auto *R = new (C, ID) CXXRecordDecl(CXXRecord, TagTypeKind::Struct, C, nullptr, SourceLocation(), SourceLocation(), nullptr, nullptr); @@ -2163,7 +2163,7 @@ CXXDeductionGuideDecl *CXXDeductionGuideDecl::Create( } CXXDeductionGuideDecl *CXXDeductionGuideDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) CXXDeductionGuideDecl( C, nullptr, SourceLocation(), ExplicitSpecifier(), DeclarationNameInfo(), QualType(), nullptr, SourceLocation(), nullptr, @@ -2176,7 +2176,7 @@ RequiresExprBodyDecl *RequiresExprBodyDecl::Create( } RequiresExprBodyDecl *RequiresExprBodyDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) RequiresExprBodyDecl(C, nullptr, SourceLocation()); } @@ -2281,7 +2281,7 @@ CXXMethodDecl::Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, isInline, ConstexprKind, EndLocation, TrailingRequiresClause); } -CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) CXXMethodDecl( CXXMethod, C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, SC_None, false, false, @@ -2699,7 +2699,7 @@ CXXConstructorDecl::CXXConstructorDecl( void CXXConstructorDecl::anchor() {} CXXConstructorDecl *CXXConstructorDecl::CreateDeserialized(ASTContext &C, - unsigned ID, + Decl::DeclID ID, uint64_t AllocKind) { bool hasTrailingExplicit = static_cast(AllocKind & TAKHasTailExplicit); bool isInheritingConstructor = @@ -2846,7 +2846,7 @@ bool CXXConstructorDecl::isSpecializationCopyingObject() const { void CXXDestructorDecl::anchor() {} CXXDestructorDecl * -CXXDestructorDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +CXXDestructorDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) CXXDestructorDecl( C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, false, false, false, ConstexprSpecKind::Unspecified, nullptr); @@ -2878,7 +2878,7 @@ void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) { void CXXConversionDecl::anchor() {} CXXConversionDecl * -CXXConversionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +CXXConversionDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) CXXConversionDecl( C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, false, false, ExplicitSpecifier(), ConstexprSpecKind::Unspecified, @@ -2924,7 +2924,7 @@ LinkageSpecDecl *LinkageSpecDecl::Create(ASTContext &C, DeclContext *DC, } LinkageSpecDecl *LinkageSpecDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) LinkageSpecDecl(nullptr, SourceLocation(), 
SourceLocation(), LinkageSpecLanguageIDs::C, false); @@ -2946,7 +2946,7 @@ UsingDirectiveDecl *UsingDirectiveDecl::Create(ASTContext &C, DeclContext *DC, } UsingDirectiveDecl *UsingDirectiveDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) UsingDirectiveDecl(nullptr, SourceLocation(), SourceLocation(), NestedNameSpecifierLoc(), @@ -2985,7 +2985,7 @@ NamespaceDecl *NamespaceDecl::Create(ASTContext &C, DeclContext *DC, NamespaceDecl(C, DC, Inline, StartLoc, IdLoc, Id, PrevDecl, Nested); } -NamespaceDecl *NamespaceDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +NamespaceDecl *NamespaceDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) NamespaceDecl(C, nullptr, false, SourceLocation(), SourceLocation(), nullptr, nullptr, false); } @@ -3047,7 +3047,7 @@ NamespaceAliasDecl *NamespaceAliasDecl::Create(ASTContext &C, DeclContext *DC, } NamespaceAliasDecl * -NamespaceAliasDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +NamespaceAliasDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) NamespaceAliasDecl(C, nullptr, SourceLocation(), SourceLocation(), nullptr, NestedNameSpecifierLoc(), @@ -3103,7 +3103,7 @@ UsingShadowDecl::UsingShadowDecl(Kind K, ASTContext &C, EmptyShell Empty) redeclarable_base(C) {} UsingShadowDecl * -UsingShadowDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +UsingShadowDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) UsingShadowDecl(UsingShadow, C, EmptyShell()); } @@ -3126,7 +3126,7 @@ ConstructorUsingShadowDecl::Create(ASTContext &C, DeclContext *DC, } ConstructorUsingShadowDecl * -ConstructorUsingShadowDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ConstructorUsingShadowDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ConstructorUsingShadowDecl(C, EmptyShell()); } @@ -3174,7 +3174,7 @@ UsingDecl *UsingDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation UL, return new (C, DC) UsingDecl(DC, UL, QualifierLoc, NameInfo, HasTypename); } -UsingDecl *UsingDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +UsingDecl *UsingDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) UsingDecl(nullptr, SourceLocation(), NestedNameSpecifierLoc(), DeclarationNameInfo(), false); @@ -3198,7 +3198,7 @@ UsingEnumDecl *UsingEnumDecl::Create(ASTContext &C, DeclContext *DC, UsingEnumDecl(DC, EnumType->getType()->getAsTagDecl()->getDeclName(), UL, EL, NL, EnumType); } -UsingEnumDecl *UsingEnumDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +UsingEnumDecl *UsingEnumDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) UsingEnumDecl(nullptr, DeclarationName(), SourceLocation(), SourceLocation(), SourceLocation(), nullptr); @@ -3217,7 +3217,7 @@ UsingPackDecl *UsingPackDecl::Create(ASTContext &C, DeclContext *DC, return new (C, DC, Extra) UsingPackDecl(DC, InstantiatedFrom, UsingDecls); } -UsingPackDecl *UsingPackDecl::CreateDeserialized(ASTContext &C, unsigned ID, +UsingPackDecl *UsingPackDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NumExpansions) { size_t Extra = additionalSizeToAlloc(NumExpansions); auto *Result = @@ -3243,7 +3243,7 @@ UnresolvedUsingValueDecl::Create(ASTContext &C, DeclContext *DC, } UnresolvedUsingValueDecl * -UnresolvedUsingValueDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +UnresolvedUsingValueDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) 
UnresolvedUsingValueDecl(nullptr, QualType(), SourceLocation(), NestedNameSpecifierLoc(), @@ -3273,7 +3273,7 @@ UnresolvedUsingTypenameDecl::Create(ASTContext &C, DeclContext *DC, } UnresolvedUsingTypenameDecl * -UnresolvedUsingTypenameDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +UnresolvedUsingTypenameDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) UnresolvedUsingTypenameDecl( nullptr, SourceLocation(), SourceLocation(), NestedNameSpecifierLoc(), SourceLocation(), nullptr, SourceLocation()); @@ -3286,7 +3286,7 @@ UnresolvedUsingIfExistsDecl::Create(ASTContext &Ctx, DeclContext *DC, } UnresolvedUsingIfExistsDecl * -UnresolvedUsingIfExistsDecl::CreateDeserialized(ASTContext &Ctx, unsigned ID) { +UnresolvedUsingIfExistsDecl::CreateDeserialized(ASTContext &Ctx, Decl::DeclID ID) { return new (Ctx, ID) UnresolvedUsingIfExistsDecl(nullptr, SourceLocation(), DeclarationName()); } @@ -3310,7 +3310,7 @@ StaticAssertDecl *StaticAssertDecl::Create(ASTContext &C, DeclContext *DC, } StaticAssertDecl *StaticAssertDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) StaticAssertDecl(nullptr, SourceLocation(), nullptr, nullptr, SourceLocation(), false); } @@ -3332,7 +3332,7 @@ BindingDecl *BindingDecl::Create(ASTContext &C, DeclContext *DC, return new (C, DC) BindingDecl(DC, IdLoc, Id); } -BindingDecl *BindingDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +BindingDecl *BindingDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) BindingDecl(nullptr, SourceLocation(), nullptr); } @@ -3363,7 +3363,7 @@ DecompositionDecl *DecompositionDecl::Create(ASTContext &C, DeclContext *DC, } DecompositionDecl *DecompositionDecl::CreateDeserialized(ASTContext &C, - unsigned ID, + Decl::DeclID ID, unsigned NumBindings) { size_t Extra = additionalSizeToAlloc(NumBindings); auto *Result = new (C, ID, Extra) @@ -3402,7 +3402,7 @@ MSPropertyDecl *MSPropertyDecl::Create(ASTContext &C, DeclContext *DC, } MSPropertyDecl *MSPropertyDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) MSPropertyDecl(nullptr, SourceLocation(), DeclarationName(), QualType(), nullptr, SourceLocation(), nullptr, nullptr); @@ -3419,7 +3419,7 @@ MSGuidDecl *MSGuidDecl::Create(const ASTContext &C, QualType T, Parts P) { return new (C, DC) MSGuidDecl(DC, T, P); } -MSGuidDecl *MSGuidDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +MSGuidDecl *MSGuidDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) MSGuidDecl(nullptr, QualType(), Parts()); } @@ -3529,7 +3529,7 @@ UnnamedGlobalConstantDecl::Create(const ASTContext &C, QualType T, } UnnamedGlobalConstantDecl * -UnnamedGlobalConstantDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +UnnamedGlobalConstantDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) UnnamedGlobalConstantDecl(C, nullptr, QualType(), APValue()); } diff --git a/clang/lib/AST/DeclFriend.cpp b/clang/lib/AST/DeclFriend.cpp index 8ec1dea84df5f1..1fabf8aa80c2bd 100644 --- a/clang/lib/AST/DeclFriend.cpp +++ b/clang/lib/AST/DeclFriend.cpp @@ -62,7 +62,7 @@ FriendDecl *FriendDecl::Create(ASTContext &C, DeclContext *DC, return FD; } -FriendDecl *FriendDecl::CreateDeserialized(ASTContext &C, unsigned ID, +FriendDecl *FriendDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned FriendTypeNumTPLists) { std::size_t Extra = additionalSizeToAlloc(FriendTypeNumTPLists); diff --git a/clang/lib/AST/DeclObjC.cpp 
b/clang/lib/AST/DeclObjC.cpp index 32c14938cd5888..d4275eea058212 100644 --- a/clang/lib/AST/DeclObjC.cpp +++ b/clang/lib/AST/DeclObjC.cpp @@ -862,7 +862,7 @@ ObjCMethodDecl *ObjCMethodDecl::Create( isImplicitlyDeclared, isDefined, impControl, HasRelatedResultType); } -ObjCMethodDecl *ObjCMethodDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ObjCMethodDecl *ObjCMethodDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ObjCMethodDecl(SourceLocation(), SourceLocation(), Selector(), QualType(), nullptr, nullptr); } @@ -1486,7 +1486,7 @@ ObjCTypeParamDecl *ObjCTypeParamDecl::Create(ASTContext &ctx, DeclContext *dc, } ObjCTypeParamDecl *ObjCTypeParamDecl::CreateDeserialized(ASTContext &ctx, - unsigned ID) { + Decl::DeclID ID) { return new (ctx, ID) ObjCTypeParamDecl(ctx, nullptr, ObjCTypeParamVariance::Invariant, SourceLocation(), 0, SourceLocation(), @@ -1551,7 +1551,7 @@ ObjCInterfaceDecl *ObjCInterfaceDecl::Create( } ObjCInterfaceDecl *ObjCInterfaceDecl::CreateDeserialized(const ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { auto *Result = new (C, ID) ObjCInterfaceDecl(C, nullptr, SourceLocation(), nullptr, nullptr, SourceLocation(), nullptr, false); @@ -1865,7 +1865,7 @@ ObjCIvarDecl *ObjCIvarDecl::Create(ASTContext &C, ObjCContainerDecl *DC, synthesized); } -ObjCIvarDecl *ObjCIvarDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ObjCIvarDecl *ObjCIvarDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ObjCIvarDecl(nullptr, SourceLocation(), SourceLocation(), nullptr, QualType(), nullptr, ObjCIvarDecl::None, nullptr, false); @@ -1914,7 +1914,7 @@ ObjCAtDefsFieldDecl } ObjCAtDefsFieldDecl *ObjCAtDefsFieldDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ObjCAtDefsFieldDecl(nullptr, SourceLocation(), SourceLocation(), nullptr, QualType(), nullptr); @@ -1949,7 +1949,7 @@ ObjCProtocolDecl *ObjCProtocolDecl::Create(ASTContext &C, DeclContext *DC, } ObjCProtocolDecl *ObjCProtocolDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { ObjCProtocolDecl *Result = new (C, ID) ObjCProtocolDecl(C, nullptr, nullptr, SourceLocation(), SourceLocation(), nullptr); @@ -2148,7 +2148,7 @@ ObjCCategoryDecl *ObjCCategoryDecl::Create( } ObjCCategoryDecl *ObjCCategoryDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ObjCCategoryDecl(nullptr, SourceLocation(), SourceLocation(), SourceLocation(), nullptr, nullptr, nullptr); @@ -2189,7 +2189,7 @@ ObjCCategoryImplDecl *ObjCCategoryImplDecl::Create( } ObjCCategoryImplDecl *ObjCCategoryImplDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ObjCCategoryImplDecl(nullptr, nullptr, nullptr, SourceLocation(), SourceLocation(), SourceLocation()); @@ -2296,7 +2296,7 @@ ObjCImplementationDecl::Create(ASTContext &C, DeclContext *DC, } ObjCImplementationDecl * -ObjCImplementationDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ObjCImplementationDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ObjCImplementationDecl(nullptr, nullptr, nullptr, SourceLocation(), SourceLocation()); } @@ -2339,7 +2339,7 @@ ObjCCompatibleAliasDecl::Create(ASTContext &C, DeclContext *DC, } ObjCCompatibleAliasDecl * -ObjCCompatibleAliasDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +ObjCCompatibleAliasDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) ObjCCompatibleAliasDecl(nullptr, SourceLocation(), 
nullptr, nullptr); } @@ -2360,7 +2360,7 @@ ObjCPropertyDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L, } ObjCPropertyDecl *ObjCPropertyDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ObjCPropertyDecl(nullptr, SourceLocation(), nullptr, SourceLocation(), SourceLocation(), QualType(), nullptr, None); @@ -2393,7 +2393,7 @@ ObjCPropertyImplDecl *ObjCPropertyImplDecl::Create(ASTContext &C, } ObjCPropertyImplDecl *ObjCPropertyImplDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ObjCPropertyImplDecl(nullptr, SourceLocation(), SourceLocation(), nullptr, Dynamic, nullptr, SourceLocation()); diff --git a/clang/lib/AST/DeclOpenMP.cpp b/clang/lib/AST/DeclOpenMP.cpp index ac5780f82dbbb2..b178a15aab5f28 100644 --- a/clang/lib/AST/DeclOpenMP.cpp +++ b/clang/lib/AST/DeclOpenMP.cpp @@ -36,7 +36,7 @@ OMPThreadPrivateDecl *OMPThreadPrivateDecl::Create(ASTContext &C, } OMPThreadPrivateDecl *OMPThreadPrivateDecl::CreateDeserialized(ASTContext &C, - unsigned ID, + Decl::DeclID ID, unsigned N) { return OMPDeclarativeDirective::createEmptyDirective( C, ID, 0, N); @@ -63,7 +63,7 @@ OMPAllocateDecl *OMPAllocateDecl::Create(ASTContext &C, DeclContext *DC, return D; } -OMPAllocateDecl *OMPAllocateDecl::CreateDeserialized(ASTContext &C, unsigned ID, +OMPAllocateDecl *OMPAllocateDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NVars, unsigned NClauses) { return OMPDeclarativeDirective::createEmptyDirective( @@ -89,7 +89,7 @@ OMPRequiresDecl *OMPRequiresDecl::Create(ASTContext &C, DeclContext *DC, L); } -OMPRequiresDecl *OMPRequiresDecl::CreateDeserialized(ASTContext &C, unsigned ID, +OMPRequiresDecl *OMPRequiresDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned N) { return OMPDeclarativeDirective::createEmptyDirective( C, ID, N, 0, SourceLocation()); @@ -117,7 +117,7 @@ OMPDeclareReductionDecl *OMPDeclareReductionDecl::Create( } OMPDeclareReductionDecl * -OMPDeclareReductionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +OMPDeclareReductionDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) OMPDeclareReductionDecl( OMPDeclareReduction, /*DC=*/nullptr, SourceLocation(), DeclarationName(), QualType(), /*PrevDeclInScope=*/nullptr); @@ -148,7 +148,7 @@ OMPDeclareMapperDecl *OMPDeclareMapperDecl::Create( } OMPDeclareMapperDecl *OMPDeclareMapperDecl::CreateDeserialized(ASTContext &C, - unsigned ID, + Decl::DeclID ID, unsigned N) { return OMPDeclarativeDirective::createEmptyDirective( C, ID, N, 1, SourceLocation(), DeclarationName(), QualType(), @@ -179,7 +179,7 @@ OMPCapturedExprDecl *OMPCapturedExprDecl::Create(ASTContext &C, DeclContext *DC, } OMPCapturedExprDecl *OMPCapturedExprDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) OMPCapturedExprDecl(C, nullptr, nullptr, QualType(), /*TInfo=*/nullptr, SourceLocation()); } diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 5aa2484197372b..0ba271c3e04ee5 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -418,7 +418,7 @@ FunctionTemplateDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L, } FunctionTemplateDecl *FunctionTemplateDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) FunctionTemplateDecl(C, nullptr, SourceLocation(), DeclarationName(), nullptr, nullptr); } @@ -503,7 +503,7 @@ ClassTemplateDecl 
*ClassTemplateDecl::Create(ASTContext &C, DeclContext *DC, } ClassTemplateDecl *ClassTemplateDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { return new (C, ID) ClassTemplateDecl(C, nullptr, SourceLocation(), DeclarationName(), nullptr, nullptr); } @@ -652,14 +652,14 @@ TemplateTypeParmDecl *TemplateTypeParmDecl::Create( } TemplateTypeParmDecl * -TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, unsigned ID) { +TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID) { return new (C, ID) TemplateTypeParmDecl(nullptr, SourceLocation(), SourceLocation(), nullptr, false, false, std::nullopt); } TemplateTypeParmDecl * -TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, unsigned ID, +TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, Decl::DeclID ID, bool HasTypeConstraint) { return new (C, ID, additionalSizeToAlloc(HasTypeConstraint ? 1 : 0)) @@ -759,7 +759,7 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( } NonTypeTemplateParmDecl * -NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID, +NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, bool HasTypeConstraint) { return new (C, ID, additionalSizeToAlloc, @@ -770,7 +770,7 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID, } NonTypeTemplateParmDecl * -NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID, +NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NumExpandedTypes, bool HasTypeConstraint) { auto *NTTP = @@ -836,13 +836,13 @@ TemplateTemplateParmDecl::Create(const ASTContext &C, DeclContext *DC, } TemplateTemplateParmDecl * -TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID) { +TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) { return new (C, ID) TemplateTemplateParmDecl(nullptr, SourceLocation(), 0, 0, false, nullptr, false, nullptr); } TemplateTemplateParmDecl * -TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, unsigned ID, +TemplateTemplateParmDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID, unsigned NumExpansions) { auto *TTP = new (C, ID, additionalSizeToAlloc(NumExpansions)) @@ -949,7 +949,7 @@ ClassTemplateSpecializationDecl::Create(ASTContext &Context, TagKind TK, ClassTemplateSpecializationDecl * ClassTemplateSpecializationDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { auto *Result = new (C, ID) ClassTemplateSpecializationDecl(C, ClassTemplateSpecialization); Result->setMayHaveOutOfDateDef(false); @@ -1036,7 +1036,7 @@ ConceptDecl *ConceptDecl::Create(ASTContext &C, DeclContext *DC, } ConceptDecl *ConceptDecl::CreateDeserialized(ASTContext &C, - unsigned ID) { + Decl::DeclID ID) { ConceptDecl *Result = new (C, ID) ConceptDecl(nullptr, SourceLocation(), DeclarationName(), nullptr, nullptr); @@ -1070,7 +1070,7 @@ ImplicitConceptSpecializationDecl *ImplicitConceptSpecializationDecl::Create( ImplicitConceptSpecializationDecl * ImplicitConceptSpecializationDecl::CreateDeserialized( - const ASTContext &C, unsigned ID, unsigned NumTemplateArgs) { + const ASTContext &C, Decl::DeclID ID, unsigned NumTemplateArgs) { return new (C, ID, additionalSizeToAlloc(NumTemplateArgs)) ImplicitConceptSpecializationDecl(EmptyShell{}, NumTemplateArgs); } @@ -1133,7 +1133,7 @@ Create(ASTContext &Context, TagKind TK,DeclContext *DC, ClassTemplatePartialSpecializationDecl * ClassTemplatePartialSpecializationDecl::CreateDeserialized(ASTContext 
&C,
-                                                         unsigned ID) {
+                                                         Decl::DeclID ID) {
   auto *Result = new (C, ID) ClassTemplatePartialSpecializationDecl(C);
   Result->setMayHaveOutOfDateDef(false);
   return Result;
 }
@@ -1160,7 +1160,7 @@ FriendTemplateDecl::Create(ASTContext &Context, DeclContext *DC,
 }
 FriendTemplateDecl *FriendTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                           unsigned ID) {
+                                                           Decl::DeclID ID) {
   return new (C, ID) FriendTemplateDecl(EmptyShell());
 }
@@ -1180,7 +1180,7 @@ TypeAliasTemplateDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L,
 }
 TypeAliasTemplateDecl *TypeAliasTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                                 unsigned ID) {
+                                                                 Decl::DeclID ID) {
   return new (C, ID) TypeAliasTemplateDecl(C, nullptr, SourceLocation(),
                                            DeclarationName(), nullptr, nullptr);
 }
@@ -1218,7 +1218,7 @@ VarTemplateDecl *VarTemplateDecl::Create(ASTContext &C, DeclContext *DC,
 }
 VarTemplateDecl *VarTemplateDecl::CreateDeserialized(ASTContext &C,
-                                                     unsigned ID) {
+                                                     Decl::DeclID ID) {
   return new (C, ID) VarTemplateDecl(C, nullptr, SourceLocation(),
                                      DeclarationName(), nullptr, nullptr);
 }
@@ -1340,7 +1340,7 @@ VarTemplateSpecializationDecl *VarTemplateSpecializationDecl::Create(
 }
 VarTemplateSpecializationDecl *
-VarTemplateSpecializationDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
+VarTemplateSpecializationDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
   return new (C, ID)
       VarTemplateSpecializationDecl(VarTemplateSpecialization, C);
 }
@@ -1432,7 +1432,7 @@ VarTemplatePartialSpecializationDecl::Create(
 VarTemplatePartialSpecializationDecl *
 VarTemplatePartialSpecializationDecl::CreateDeserialized(ASTContext &C,
-                                                         unsigned ID) {
+                                                         Decl::DeclID ID) {
   return new (C, ID) VarTemplatePartialSpecializationDecl(C);
 }
@@ -1546,7 +1546,7 @@ TemplateParamObjectDecl *TemplateParamObjectDecl::Create(const ASTContext &C,
 }
 TemplateParamObjectDecl *
-TemplateParamObjectDecl::CreateDeserialized(ASTContext &C, unsigned ID) {
+TemplateParamObjectDecl::CreateDeserialized(ASTContext &C, Decl::DeclID ID) {
   auto *TPOD =
       new (C, ID) TemplateParamObjectDecl(nullptr, QualType(), APValue());
   C.addDestruction(&TPOD->Value);
   return TPOD;
 }
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index dd0bacd73acb10..cebedf59e0593f 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1548,17 +1548,16 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
   if (!CheckArray(S, OpPC, Ptr))
     return false;
-  // Get a version of the index comparable to the type.
-  T Index = T::from(Ptr.getIndex(), Offset.bitWidth());
-  // Compute the largest index into the array.
-  T MaxIndex = T::from(Ptr.getNumElems(), Offset.bitWidth());
+  uint64_t Index = Ptr.getIndex();
+  uint64_t MaxIndex = static_cast<uint64_t>(Ptr.getNumElems());
   bool Invalid = false;
   // Helper to report an invalid offset, computed as APSInt.
   auto DiagInvalidOffset = [&]() -> void {
     const unsigned Bits = Offset.bitWidth();
-    APSInt APOffset(Offset.toAPSInt().extend(Bits + 2), false);
-    APSInt APIndex(Index.toAPSInt().extend(Bits + 2), false);
+    APSInt APOffset(Offset.toAPSInt().extend(Bits + 2), /*IsUnsigned=*/false);
+    APSInt APIndex(APInt(Bits + 2, Index, /*IsSigned=*/true),
+                   /*IsUnsigned=*/false);
     APSInt NewIndex =
         (Op == ArithOp::Add) ? (APIndex + APOffset) : (APIndex - APOffset);
     S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_array_index)
@@ -1569,22 +1568,24 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
   };
   if (Ptr.isBlockPointer()) {
-    T MaxOffset = T::from(MaxIndex - Index, Offset.bitWidth());
+    uint64_t IOffset = static_cast<uint64_t>(Offset);
+    uint64_t MaxOffset = MaxIndex - Index;
+
     if constexpr (Op == ArithOp::Add) {
       // If the new offset would be negative, bail out.
-      if (Offset.isNegative() && (Offset.isMin() || -Offset > Index))
+      if (Offset.isNegative() && (Offset.isMin() || -IOffset > Index))
         DiagInvalidOffset();
       // If the new offset would be out of bounds, bail out.
-      if (Offset.isPositive() && Offset > MaxOffset)
+      if (Offset.isPositive() && IOffset > MaxOffset)
         DiagInvalidOffset();
     } else {
       // If the new offset would be negative, bail out.
-      if (Offset.isPositive() && Index < Offset)
+      if (Offset.isPositive() && Index < IOffset)
         DiagInvalidOffset();
       // If the new offset would be out of bounds, bail out.
-      if (Offset.isNegative() && (Offset.isMin() || -Offset > MaxOffset))
+      if (Offset.isNegative() && (Offset.isMin() || -IOffset > MaxOffset))
         DiagInvalidOffset();
     }
   }
@@ -1601,7 +1602,7 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
   else
     Result = WideIndex - WideOffset;
-  S.Stk.push<Pointer>(Ptr.atIndex(static_cast<unsigned>(Result)));
+  S.Stk.push<Pointer>(Ptr.atIndex(static_cast<uint64_t>(Result)));
   return true;
 }
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index f562f9e1cb19fb..565c85bc2e0c21 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -9,6 +9,7 @@
 #include "Boolean.h"
 #include "Interp.h"
 #include "PrimType.h"
+#include "clang/AST/OSLog.h"
 #include "clang/AST/RecordLayout.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/TargetInfo.h"
@@ -1088,6 +1089,17 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC,
   return false;
 }
+static bool interp__builtin_os_log_format_buffer_size(InterpState &S,
+                                                      CodePtr OpPC,
+                                                      const InterpFrame *Frame,
+                                                      const Function *Func,
+                                                      const CallExpr *Call) {
+  analyze_os_log::OSLogBufferLayout Layout;
+  analyze_os_log::computeOSLogBufferLayout(S.getCtx(), Call, Layout);
+  pushInteger(S, Layout.size().getQuantity(), Call->getType());
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
   const InterpFrame *Frame = S.Current;
@@ -1409,6 +1421,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
       return false;
     break;
+  case Builtin::BI__builtin_os_log_format_buffer_size:
+    if (!interp__builtin_os_log_format_buffer_size(S, OpPC, Frame, F, Call))
+      return false;
+    break;
+
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index e163e658d462b2..5ef31671ae7be5 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -23,7 +23,7 @@ Pointer::Pointer(Block *Pointee)
     : Pointer(Pointee, Pointee->getDescriptor()->getMetadataSize(),
               Pointee->getDescriptor()->getMetadataSize()) {}
-Pointer::Pointer(Block *Pointee, unsigned BaseAndOffset)
+Pointer::Pointer(Block *Pointee, uint64_t BaseAndOffset)
     : Pointer(Pointee, BaseAndOffset, BaseAndOffset) {}
 Pointer::Pointer(const Pointer &P)
@@ -34,7 +34,7 @@ Pointer::Pointer(const Pointer &P)
   PointeeStorage.BS.Pointee->addPointer(this);
 }
-Pointer::Pointer(Block *Pointee, unsigned
Base, unsigned Offset) +Pointer::Pointer(Block *Pointee, unsigned Base, uint64_t Offset) : Offset(Offset), StorageKind(Storage::Block) { assert((Base == RootPtrMark || Base % alignof(void *) == 0) && "wrong base"); diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index b4475577b74625..c4d701bc71b7bf 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -89,10 +89,10 @@ class Pointer { PointeeStorage.Int.Desc = nullptr; } Pointer(Block *B); - Pointer(Block *B, unsigned BaseAndOffset); + Pointer(Block *B, uint64_t BaseAndOffset); Pointer(const Pointer &P); Pointer(Pointer &&P); - Pointer(uint64_t Address, const Descriptor *Desc, unsigned Offset = 0) + Pointer(uint64_t Address, const Descriptor *Desc, uint64_t Offset = 0) : Offset(Offset), StorageKind(Storage::Int) { PointeeStorage.Int.Value = Address; PointeeStorage.Int.Desc = Desc; @@ -134,14 +134,14 @@ class Pointer { std::optional toRValue(const Context &Ctx) const; /// Offsets a pointer inside an array. - [[nodiscard]] Pointer atIndex(unsigned Idx) const { + [[nodiscard]] Pointer atIndex(uint64_t Idx) const { if (isIntegralPointer()) return Pointer(asIntPointer().Value, asIntPointer().Desc, Idx); if (asBlockPointer().Base == RootPtrMark) return Pointer(asBlockPointer().Pointee, RootPtrMark, getDeclDesc()->getSize()); - unsigned Off = Idx * elemSize(); + uint64_t Off = Idx * elemSize(); if (getFieldDesc()->ElemDesc) Off += sizeof(InlineDescriptor); else @@ -630,7 +630,7 @@ class Pointer { friend class DeadBlock; friend struct InitMap; - Pointer(Block *Pointee, unsigned Base, unsigned Offset); + Pointer(Block *Pointee, unsigned Base, uint64_t Offset); /// Returns the embedded descriptor preceding a field. InlineDescriptor *getInlineDesc() const { @@ -656,7 +656,7 @@ class Pointer { } /// Offset into the storage. - unsigned Offset = 0; + uint64_t Offset = 0; /// Previous link in the pointer chain. Pointer *Prev = nullptr; diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index c3b98d2d2149cb..106c69dd5beed7 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1062,26 +1062,23 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, // ::= // const DeclContext *DC = Context.getEffectiveDeclContext(ND); + bool IsLambda = isLambda(ND); // If this is an extern variable declared locally, the relevant DeclContext // is that of the containing namespace, or the translation unit. // FIXME: This is a hack; extern variables declared locally should have // a proper semantic declaration context! - if (isLocalContainerContext(DC) && ND->hasLinkage() && !isLambda(ND)) + if (isLocalContainerContext(DC) && ND->hasLinkage() && !IsLambda) while (!DC->isNamespace() && !DC->isTranslationUnit()) DC = Context.getEffectiveParentContext(DC); - else if (GetLocalClassDecl(ND)) { + else if (GetLocalClassDecl(ND) && + (!IsLambda || isCompatibleWith(LangOptions::ClangABI::Ver18))) { mangleLocalName(GD, AdditionalAbiTags); return; } assert(!isa(DC) && "context cannot be LinkageSpecDecl"); - if (isLocalContainerContext(DC)) { - mangleLocalName(GD, AdditionalAbiTags); - return; - } - // Closures can require a nested-name mangling even if they're semantically // in the global namespace. 
if (const NamedDecl *PrefixND = getClosurePrefix(ND)) { @@ -1089,6 +1086,11 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, return; } + if (isLocalContainerContext(DC)) { + mangleLocalName(GD, AdditionalAbiTags); + return; + } + if (DC->isTranslationUnit() || isStdNamespace(DC)) { // Check if we have a template. const TemplateArgumentList *TemplateArgs = nullptr; @@ -2201,8 +2203,6 @@ void CXXNameMangler::manglePrefix(const DeclContext *DC, bool NoFunction) { if (NoFunction && isLocalContainerContext(DC)) return; - assert(!isLocalContainerContext(DC)); - const NamedDecl *ND = cast(DC); if (mangleSubstitution(ND)) return; diff --git a/clang/lib/ASTMatchers/Dynamic/Marshallers.h b/clang/lib/ASTMatchers/Dynamic/Marshallers.h index c76ddf17b719d4..0e640cbada7268 100644 --- a/clang/lib/ASTMatchers/Dynamic/Marshallers.h +++ b/clang/lib/ASTMatchers/Dynamic/Marshallers.h @@ -937,7 +937,7 @@ class MapAnyOfMatcherDescriptor : public MatcherDescriptor { public: MapAnyOfMatcherDescriptor(ASTNodeKind CladeNodeKind, std::vector NodeKinds) - : CladeNodeKind(CladeNodeKind), NodeKinds(NodeKinds) {} + : CladeNodeKind(CladeNodeKind), NodeKinds(std::move(NodeKinds)) {} VariantMatcher create(SourceRange NameRange, ArrayRef Args, Diagnostics *Error) const override { @@ -1026,7 +1026,7 @@ class MapAnyOfBuilderDescriptor : public MatcherDescriptor { } return std::make_unique(CladeNodeKind, - NodeKinds); + std::move(NodeKinds)); } bool isVariadic() const override { return true; } diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp index 6f179c1403b6f5..619bf772bba5ee 100644 --- a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp +++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp @@ -261,4 +261,10 @@ ReferencedDecls getReferencedDecls(const FunctionDecl &FD) { return Result; } +ReferencedDecls getReferencedDecls(const Stmt &S) { + ReferencedDecls Result; + getReferencedDecls(S, Result); + return Result; +} + } // namespace clang::dataflow diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index 6998e6d7a5170b..05395e07a7a68c 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -400,8 +400,8 @@ class ResultObjectVisitor : public RecursiveASTVisitor { // Fields of non-record type are handled in // `TransferVisitor::VisitInitListExpr()`. if (Field->getType()->isRecordType()) - PropagateResultObject(Init, - cast(Loc->getChild(*Field))); + PropagateResultObject( + Init, cast(Loc->getChild(*Field))); } } @@ -610,8 +610,8 @@ Environment Environment::pushCall(const CallExpr *Call) const { if (const auto *MethodCall = dyn_cast(Call)) { if (const Expr *Arg = MethodCall->getImplicitObjectArgument()) { if (!isa(Arg)) - Env.ThisPointeeLoc = - cast(getStorageLocation(*Arg)); + Env.ThisPointeeLoc = + cast(getStorageLocation(*Arg)); // Otherwise (when the argument is `this`), retain the current // environment's `ThisPointeeLoc`. } @@ -1083,7 +1083,7 @@ StorageLocation &Environment::createObjectInternal(const ValueDecl *D, // be null. 
if (InitExpr) { if (auto *InitExprLoc = getStorageLocation(*InitExpr)) - return *InitExprLoc; + return *InitExprLoc; } // Even though we have an initializer, we might not get an @@ -1191,9 +1191,7 @@ void Environment::dump(raw_ostream &OS) const { DACtx->dumpFlowCondition(FlowConditionToken, OS); } -void Environment::dump() const { - dump(llvm::dbgs()); -} +void Environment::dump() const { dump(llvm::dbgs()); } Environment::PrValueToResultObject Environment::buildResultObjectMap( DataflowAnalysisContext *DACtx, const FunctionDecl *FuncDecl, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 6cc00b85664f41..22c3f8642ad8eb 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -104,6 +104,21 @@ static cl::opt ClSanitizeOnOptimizerEarlyEP( "sanitizer-early-opt-ep", cl::Optional, cl::desc("Insert sanitizers on OptimizerEarlyEP.")); +// Experiment to mark cold functions as optsize/minsize/optnone. +// TODO: remove once this is exposed as a proper driver flag. +static cl::opt ClPGOColdFuncAttr( + "pgo-cold-func-opt", cl::init(PGOOptions::ColdFuncOpt::Default), cl::Hidden, + cl::desc( + "Function attribute to apply to cold functions as determined by PGO"), + cl::values(clEnumValN(PGOOptions::ColdFuncOpt::Default, "default", + "Default (no attribute)"), + clEnumValN(PGOOptions::ColdFuncOpt::OptSize, "optsize", + "Mark cold functions with optsize."), + clEnumValN(PGOOptions::ColdFuncOpt::MinSize, "minsize", + "Mark cold functions with minsize."), + clEnumValN(PGOOptions::ColdFuncOpt::OptNone, "optnone", + "Mark cold functions with optnone."))); + extern cl::opt ProfileCorrelate; // Re-link builtin bitcodes after optimization @@ -768,42 +783,41 @@ void EmitAssemblyHelper::RunOptimizationPipeline( CodeGenOpts.InstrProfileOutput.empty() ? getDefaultProfileGenName() : CodeGenOpts.InstrProfileOutput, "", "", CodeGenOpts.MemoryProfileUsePath, nullptr, PGOOptions::IRInstr, - PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, /*PseudoProbeForProfiling=*/false, CodeGenOpts.AtomicProfileUpdate); else if (CodeGenOpts.hasProfileIRUse()) { // -fprofile-use. auto CSAction = CodeGenOpts.hasProfileCSIRUse() ? 
PGOOptions::CSIRUse : PGOOptions::NoCSAction; - PGOOpt = PGOOptions( - CodeGenOpts.ProfileInstrumentUsePath, "", - CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS, - PGOOptions::IRUse, CSAction, PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions(CodeGenOpts.ProfileInstrumentUsePath, "", + CodeGenOpts.ProfileRemappingFile, + CodeGenOpts.MemoryProfileUsePath, VFS, + PGOOptions::IRUse, CSAction, ClPGOColdFuncAttr, + CodeGenOpts.DebugInfoForProfiling); } else if (!CodeGenOpts.SampleProfileFile.empty()) // -fprofile-sample-use PGOOpt = PGOOptions( CodeGenOpts.SampleProfileFile, "", CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::SampleUse, - PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, CodeGenOpts.PseudoProbeForProfiling); else if (!CodeGenOpts.MemoryProfileUsePath.empty()) // -fmemory-profile-use (without any of the above options) PGOOpt = PGOOptions("", "", "", CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); else if (CodeGenOpts.PseudoProbeForProfiling) // -fpseudo-probe-for-profiling - PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling, true); + PGOOpt = + PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, + PGOOptions::NoAction, PGOOptions::NoCSAction, + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, true); else if (CodeGenOpts.DebugInfoForProfiling) // -fdebug-info-for-profiling PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, true); + ClPGOColdFuncAttr, true); // Check to see if we want to generate a CS profile. if (CodeGenOpts.hasProfileCSIRInstr()) { @@ -820,14 +834,13 @@ void EmitAssemblyHelper::RunOptimizationPipeline( : CodeGenOpts.InstrProfileOutput; PGOOpt->CSAction = PGOOptions::CSIRInstr; } else - PGOOpt = - PGOOptions("", - CodeGenOpts.InstrProfileOutput.empty() - ? getDefaultProfileGenName() - : CodeGenOpts.InstrProfileOutput, - "", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, - PGOOptions::CSIRInstr, PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions("", + CodeGenOpts.InstrProfileOutput.empty() + ? 
getDefaultProfileGenName()
+                            : CodeGenOpts.InstrProfileOutput,
+                        "", /*MemoryProfile=*/"", nullptr,
+                        PGOOptions::NoAction, PGOOptions::CSIRInstr,
+                        ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling);
   }
   if (TM)
     TM->setPGOOption(PGOOpt);
 }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index a05874e63c73c2..afe2de5d00ac5d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -826,29 +826,32 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
     ASTContext &Ctx, const RecordDecl *RD, StringRef Name, uint64_t &Offset) {
   const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
       getLangOpts().getStrictFlexArraysLevel();
-  unsigned FieldNo = 0;
-  bool IsUnion = RD->isUnion();
+  uint32_t FieldNo = 0;
-  for (const Decl *D : RD->decls()) {
-    if (const auto *Field = dyn_cast<FieldDecl>(D);
-        Field && (Name.empty() || Field->getNameAsString() == Name) &&
+  if (RD->isImplicit())
+    return nullptr;
+
+  for (const FieldDecl *FD : RD->fields()) {
+    if ((Name.empty() || FD->getNameAsString() == Name) &&
         Decl::isFlexibleArrayMemberLike(
-            Ctx, Field, Field->getType(), StrictFlexArraysLevel,
+            Ctx, FD, FD->getType(), StrictFlexArraysLevel,
             /*IgnoreTemplateOrMacroSubstitution=*/true)) {
       const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
       Offset += Layout.getFieldOffset(FieldNo);
-      return Field;
+      return FD;
     }
-    if (const auto *Record = dyn_cast<RecordDecl>(D))
-      if (const FieldDecl *Field =
-              FindFlexibleArrayMemberField(Ctx, Record, Name, Offset)) {
+    QualType Ty = FD->getType();
+    if (Ty->isRecordType()) {
+      if (const FieldDecl *Field = FindFlexibleArrayMemberField(
+              Ctx, Ty->getAsRecordDecl(), Name, Offset)) {
         const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
         Offset += Layout.getFieldOffset(FieldNo);
         return Field;
       }
+    }
-    if (!IsUnion && isa<FieldDecl>(D))
+    if (!RD->isUnion())
       ++FieldNo;
   }
@@ -858,14 +861,13 @@ const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
 static unsigned CountCountedByAttrs(const RecordDecl *RD) {
   unsigned Num = 0;
-  for (const Decl *D : RD->decls()) {
-    if (const auto *FD = dyn_cast<FieldDecl>(D);
-        FD && FD->getType()->isCountAttributedType()) {
+  for (const FieldDecl *FD : RD->fields()) {
+    if (FD->getType()->isCountAttributedType())
       return ++Num;
-    }
-    if (const auto *Rec = dyn_cast<RecordDecl>(D))
-      Num += CountCountedByAttrs(Rec);
+    QualType Ty = FD->getType();
+    if (Ty->isRecordType())
+      Num += CountCountedByAttrs(Ty->getAsRecordDecl());
   }
   return Num;
@@ -18294,20 +18296,28 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     Value *M = EmitScalarExpr(E->getArg(0));
     Value *A = EmitScalarExpr(E->getArg(1));
     Value *B = EmitScalarExpr(E->getArg(2));
-    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+    if (E->getArg(0)->getType()->hasFloatingRepresentation())
       return Builder.CreateIntrinsic(
           /*ReturnType*/ M->getType(), Intrinsic::fmuladd,
-          ArrayRef<Value *>{M, A, B}, nullptr, "dx.fmad");
-    }
+          ArrayRef<Value *>{M, A, B}, nullptr, "hlsl.fmad");
+
     if (E->getArg(0)->getType()->hasSignedIntegerRepresentation()) {
-      return Builder.CreateIntrinsic(
-          /*ReturnType*/ M->getType(), Intrinsic::dx_imad,
-          ArrayRef<Value *>{M, A, B}, nullptr, "dx.imad");
+      if (CGM.getTarget().getTriple().getArch() == llvm::Triple::dxil)
+        return Builder.CreateIntrinsic(
+            /*ReturnType*/ M->getType(), Intrinsic::dx_imad,
+            ArrayRef<Value *>{M, A, B}, nullptr, "dx.imad");
+
+      Value *Mul = Builder.CreateNSWMul(M, A);
+      return Builder.CreateNSWAdd(Mul, B);
    }
    assert(E->getArg(0)->getType()->hasUnsignedIntegerRepresentation());
-    return Builder.CreateIntrinsic
- /*ReturnType=*/M->getType(), Intrinsic::dx_umad, - ArrayRef{M, A, B}, nullptr, "dx.umad"); + if (CGM.getTarget().getTriple().getArch() == llvm::Triple::dxil) + return Builder.CreateIntrinsic( + /*ReturnType=*/M->getType(), Intrinsic::dx_umad, + ArrayRef{M, A, B}, nullptr, "dx.umad"); + + Value *Mul = Builder.CreateNUWMul(M, A); + return Builder.CreateNUWAdd(Mul, B); } case Builtin::BI__builtin_hlsl_elementwise_rcp: { Value *Op0 = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index 77c89356bc76bb..8cc82a0ee71687 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -1010,6 +1010,10 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, uint16_t CompressionMethod = static_cast(P.format); uint32_t UncompressedSize = Input.getBuffer().size(); + uint32_t TotalFileSize = MagicNumber.size() + sizeof(TotalFileSize) + + sizeof(Version) + sizeof(CompressionMethod) + + sizeof(UncompressedSize) + sizeof(TruncatedHash) + + CompressedBuffer.size(); SmallVector FinalBuffer; llvm::raw_svector_ostream OS(FinalBuffer); @@ -1017,6 +1021,8 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, OS.write(reinterpret_cast(&Version), sizeof(Version)); OS.write(reinterpret_cast(&CompressionMethod), sizeof(CompressionMethod)); + OS.write(reinterpret_cast(&TotalFileSize), + sizeof(TotalFileSize)); OS.write(reinterpret_cast(&UncompressedSize), sizeof(UncompressedSize)); OS.write(reinterpret_cast(&TruncatedHash), @@ -1034,6 +1040,8 @@ CompressedOffloadBundle::compress(llvm::compression::Params P, (UncompressedSize / (1024.0 * 1024.0)) / CompressionTimeSeconds; llvm::errs() << "Compressed bundle format version: " << Version << "\n" + << "Total file size (including headers): " + << formatWithCommas(TotalFileSize) << " bytes\n" << "Compression method used: " << MethodUsed << "\n" << "Compression level: " << P.level << "\n" << "Binary size before compression: " @@ -1059,9 +1067,9 @@ CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, StringRef Blob = Input.getBuffer(); - if (Blob.size() < HeaderSize) { + if (Blob.size() < V1HeaderSize) return llvm::MemoryBuffer::getMemBufferCopy(Blob); - } + if (llvm::identify_magic(Blob) != llvm::file_magic::offload_bundle_compressed) { if (Verbose) @@ -1069,21 +1077,32 @@ CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, return llvm::MemoryBuffer::getMemBufferCopy(Blob); } + size_t CurrentOffset = MagicSize; + uint16_t ThisVersion; + memcpy(&ThisVersion, Blob.data() + CurrentOffset, sizeof(uint16_t)); + CurrentOffset += VersionFieldSize; + uint16_t CompressionMethod; + memcpy(&CompressionMethod, Blob.data() + CurrentOffset, sizeof(uint16_t)); + CurrentOffset += MethodFieldSize; + + uint32_t TotalFileSize; + if (ThisVersion >= 2) { + if (Blob.size() < V2HeaderSize) + return createStringError(inconvertibleErrorCode(), + "Compressed bundle header size too small"); + memcpy(&TotalFileSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); + CurrentOffset += FileSizeFieldSize; + } + uint32_t UncompressedSize; + memcpy(&UncompressedSize, Blob.data() + CurrentOffset, sizeof(uint32_t)); + CurrentOffset += UncompressedSizeFieldSize; + uint64_t StoredHash; - memcpy(&ThisVersion, Input.getBuffer().data() + MagicNumber.size(), - sizeof(uint16_t)); - memcpy(&CompressionMethod, Blob.data() + MagicSize + VersionFieldSize, - sizeof(uint16_t)); - memcpy(&UncompressedSize, - Blob.data() + MagicSize + VersionFieldSize + MethodFieldSize, - 
sizeof(uint32_t)); - memcpy(&StoredHash, - Blob.data() + MagicSize + VersionFieldSize + MethodFieldSize + - SizeFieldSize, - sizeof(uint64_t)); + memcpy(&StoredHash, Blob.data() + CurrentOffset, sizeof(uint64_t)); + CurrentOffset += HashFieldSize; llvm::compression::Format CompressionFormat; if (CompressionMethod == @@ -1102,7 +1121,7 @@ CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, DecompressTimer.startTimer(); SmallVector DecompressedData; - StringRef CompressedData = Blob.substr(HeaderSize); + StringRef CompressedData = Blob.substr(CurrentOffset); if (llvm::Error DecompressionError = llvm::compression::decompress( CompressionFormat, llvm::arrayRefFromStringRef(CompressedData), DecompressedData, UncompressedSize)) @@ -1135,8 +1154,11 @@ CompressedOffloadBundle::decompress(const llvm::MemoryBuffer &Input, double DecompressionSpeedMBs = (UncompressedSize / (1024.0 * 1024.0)) / DecompressionTimeSeconds; - llvm::errs() << "Compressed bundle format version: " << ThisVersion << "\n" - << "Decompression method: " + llvm::errs() << "Compressed bundle format version: " << ThisVersion << "\n"; + if (ThisVersion >= 2) + llvm::errs() << "Total file size (from header): " + << formatWithCommas(TotalFileSize) << " bytes\n"; + llvm::errs() << "Decompression method: " << (CompressionFormat == llvm::compression::Format::Zlib ? "zlib" : "zstd") diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 456ea74caadb00..97b4aa1c9b1d0a 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4733,7 +4733,7 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T, Output.getFilename()); } -static void ProcessVSRuntimeLibrary(const ArgList &Args, +static void ProcessVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { unsigned RTOptionID = options::OPT__SLASH_MT; @@ -4796,6 +4796,12 @@ static void ProcessVSRuntimeLibrary(const ArgList &Args, // implemented in clang. CmdArgs.push_back("--dependent-lib=oldnames"); } + + // All Arm64EC object files implicitly add softintrin.lib. This is necessary + // even if the file doesn't actually refer to any of the routines because + // the CRT itself has incomplete dependency markings. + if (TC.getTriple().isWindowsArm64EC()) + CmdArgs.push_back("--dependent-lib=softintrin"); } void Clang::ConstructJob(Compilation &C, const JobAction &JA, @@ -7051,7 +7057,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Triple.isWindowsMSVCEnvironment() && !D.IsCLMode() && Args.hasArg(options::OPT_fms_runtime_lib_EQ)) - ProcessVSRuntimeLibrary(Args, CmdArgs); + ProcessVSRuntimeLibrary(getToolChain(), Args, CmdArgs); // Handle -fgcc-version, if present. 
VersionTuple GNUCVer; @@ -8178,7 +8184,7 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, ArgStringList &CmdArgs) const { bool isNVPTX = getToolChain().getTriple().isNVPTX(); - ProcessVSRuntimeLibrary(Args, CmdArgs); + ProcessVSRuntimeLibrary(getToolChain(), Args, CmdArgs); if (Arg *ShowIncludes = Args.getLastArg(options::OPT__SLASH_showIncludes, diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index a679683077ac94..cdfb4256e41d93 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1543,6 +1543,7 @@ class AnnotatingParser { return false; if (Line.MustBeDeclaration && Contexts.size() == 1 && !Contexts.back().IsExpression && !Line.startsWith(TT_ObjCProperty) && + !Line.startsWith(tok::l_paren) && !Tok->isOneOf(TT_TypeDeclarationParen, TT_RequiresExpressionLParen)) { if (const auto *Previous = Tok->Previous; !Previous || @@ -2726,8 +2727,10 @@ class AnnotatingParser { } } - if (Tok.Next->isOneOf(tok::question, tok::ampamp)) + if (Tok.Next->is(tok::question) || + (Tok.Next->is(tok::ampamp) && !Tok.Previous->isTypeName(IsCpp))) { return false; + } // `foreach((A a, B b) in someList)` should not be seen as a cast. if (Tok.Next->is(Keywords.kw_in) && Style.isCSharp()) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 5531e938e0f4f4..8236051e30c4a5 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3660,6 +3660,9 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, case LangOptions::ClangABI::Ver17: GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "17.0"); break; + case LangOptions::ClangABI::Ver18: + GenerateArg(Consumer, OPT_fclang_abi_compat_EQ, "18.0"); + break; case LangOptions::ClangABI::Latest: break; } @@ -4167,6 +4170,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Opts.setClangABICompat(LangOptions::ClangABI::Ver15); else if (Major <= 17) Opts.setClangABICompat(LangOptions::ClangABI::Ver17); + else if (Major <= 18) + Opts.setClangABICompat(LangOptions::ClangABI::Ver18); } else if (Ver != "latest") { Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index be7a0b247e03d4..4983f331137001 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -840,6 +840,7 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c) /// Copies the values stored in a 128-bit vector of [4 x float] as /// specified by the 128-bit integer vector operand. +/// /// \headerfile /// /// This intrinsic corresponds to the VPERMILPS instruction. 
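Note on the compressed offload bundle format: the OffloadBundler.cpp hunks above move the header to a V2 layout that additionally records the total file size. The following standalone sketch is illustrative only and is not part of the patch (the struct and function names are ours); it shows the field order compress() writes after the magic bytes and how a reader mirroring decompress() would walk them.

// Illustrative reader for the V2 compressed-bundle header implied by the
// compress()/decompress() changes above. Field order after the magic bytes:
// Version (u16), CompressionMethod (u16), TotalFileSize (u32, V2 only),
// UncompressedSize (u32), TruncatedHash (u64), then the compressed payload.
// Values are copied with memcpy in host byte order, as in the patch.
#include <cstddef>
#include <cstdint>
#include <cstring>

struct BundleHeaderV2 {
  uint16_t Version;
  uint16_t CompressionMethod;
  uint32_t TotalFileSize;   // New in V2; absent from V1 streams.
  uint32_t UncompressedSize;
  uint64_t TruncatedHash;
  size_t PayloadOffset;     // Where the compressed data begins.
};

// Data points just past the magic number; Size is the remaining byte count.
// Returns false if the buffer is too small for the advertised version.
static bool readHeader(const char *Data, size_t Size, BundleHeaderV2 &H) {
  size_t Off = 0;
  auto Read = [&](auto &Field) {
    std::memcpy(&Field, Data + Off, sizeof(Field));
    Off += sizeof(Field);
  };
  if (Size < sizeof(H.Version) + sizeof(H.CompressionMethod))
    return false;
  Read(H.Version);
  Read(H.CompressionMethod);
  H.TotalFileSize = 0;
  if (H.Version >= 2) {
    if (Size < Off + sizeof(H.TotalFileSize))
      return false;
    Read(H.TotalFileSize); // V2-only field, read exactly as decompress() does.
  }
  if (Size < Off + sizeof(H.UncompressedSize) + sizeof(H.TruncatedHash))
    return false;
  Read(H.UncompressedSize);
  Read(H.TruncatedHash);
  H.PayloadOffset = Off;
  return true;
}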
diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index 5eec2a2fd6d1a6..ef90fe9e6f5451 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -209,6 +209,10 @@ IncrementalParser::IncrementalParser(Interpreter &Interp, if (Err) return; CI->ExecuteAction(*Act); + + if (getCodeGen()) + CachedInCodeGenModule = GenModule(); + std::unique_ptr<ASTConsumer> IncrConsumer = std::make_unique<IncrementalASTConsumer>(Interp, CI->takeASTConsumer()); CI->setASTConsumer(std::move(IncrConsumer)); @@ -224,11 +228,8 @@ IncrementalParser::IncrementalParser(Interpreter &Interp, return; // PTU.takeError(); } - if (CodeGenerator *CG = getCodeGen()) { - std::unique_ptr<llvm::Module> M(CG->ReleaseModule()); - CG->StartModule("incr_module_" + std::to_string(PTUs.size()), - M->getContext()); - PTU->TheModule = std::move(M); + if (getCodeGen()) { + PTU->TheModule = GenModule(); assert(PTU->TheModule && "Failed to create initial PTU"); } } @@ -364,6 +365,19 @@ IncrementalParser::Parse(llvm::StringRef input) { std::unique_ptr<llvm::Module> IncrementalParser::GenModule() { static unsigned ID = 0; if (CodeGenerator *CG = getCodeGen()) { + // Clang's CodeGen is designed to work with a single llvm::Module. For + // convenience, various CodeGen parts keep a reference to the llvm::Module + // (TheModule or Module) that does not change when a new module is pushed. + // However, the execution engine wants to take ownership of the module, + // which does not map well to CodeGen's design. To work around this, we + // create an empty module to keep CodeGen happy, and we must make sure it + // always stays empty. + assert((!CachedInCodeGenModule || + (CachedInCodeGenModule->empty() && + CachedInCodeGenModule->global_empty() && + CachedInCodeGenModule->alias_empty() && + CachedInCodeGenModule->ifunc_empty())) && + "CodeGen wrote to a readonly module"); std::unique_ptr<llvm::Module> M(CG->ReleaseModule()); CG->StartModule("incr_module_" + std::to_string(ID++), M->getContext()); return M; diff --git a/clang/lib/Interpreter/IncrementalParser.h b/clang/lib/Interpreter/IncrementalParser.h index e13b74c7f65948..f63bce50acd3b9 100644 --- a/clang/lib/Interpreter/IncrementalParser.h +++ b/clang/lib/Interpreter/IncrementalParser.h @@ -24,6 +24,7 @@ #include namespace llvm { class LLVMContext; +class Module; } // namespace llvm namespace clang { @@ -57,6 +58,10 @@ class IncrementalParser { /// of code. std::list<PartialTranslationUnit> PTUs; + /// When CodeGen is created, the first llvm::Module gets cached in many + /// places and we must keep it alive.
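The GenModule assertion above encodes a simple invariant that can be read in isolation: the module handed to CodeGen must never accumulate IR. A standalone restatement (the free function is mine; upstream keeps the check inline):

#include "llvm/IR/Module.h"
#include <cassert>
#include <memory>

// Abort if CodeGen's cached module is no longer pristine, i.e. if any
// functions, globals, aliases, or ifuncs were emitted into it.
static void assertModuleStaysEmpty(const std::unique_ptr<llvm::Module> &M) {
  assert((!M || (M->empty() && M->global_empty() && M->alias_empty() &&
                 M->ifunc_empty())) &&
         "CodeGen wrote to a readonly module");
}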
+ std::unique_ptr<llvm::Module> CachedInCodeGenModule; + IncrementalParser(); public: diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 6778bc6607f75e..2ef95741b3d637 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3507,11 +3507,15 @@ bool Sema::ParseSVEImmChecks( static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { if (FD->hasAttr<ArmLocallyStreamingAttr>()) return ArmStreaming; - if (const auto *T = FD->getType()->getAs<FunctionProtoType>()) { - if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) - return ArmStreaming; - if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) - return ArmStreamingCompatible; + if (const Type *Ty = FD->getType().getTypePtrOrNull()) { + if (const auto *FPT = Ty->getAs<FunctionProtoType>()) { + if (FPT->getAArch64SMEAttributes() & + FunctionType::SME_PStateSMEnabledMask) + return ArmStreaming; + if (FPT->getAArch64SMEAttributes() & + FunctionType::SME_PStateSMCompatibleMask) + return ArmStreamingCompatible; + } } return ArmNonStreaming; } @@ -7949,6 +7953,7 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, // For variadic functions, we may have more args than parameters. // For some K&R functions, we may have fewer args than parameters. const auto N = std::min<unsigned>(Proto->getNumParams(), Args.size()); + bool AnyScalableArgsOrRet = Proto->getReturnType()->isSizelessVectorType(); for (unsigned ArgIdx = 0; ArgIdx < N; ++ArgIdx) { // Args[ArgIdx] can be null in malformed code. if (const Expr *Arg = Args[ArgIdx]) { @@ -7962,6 +7967,8 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, checkAIXMemberAlignment((Arg->getExprLoc()), Arg); QualType ParamTy = Proto->getParamType(ArgIdx); + if (ParamTy->isSizelessVectorType()) + AnyScalableArgsOrRet = true; QualType ArgTy = Arg->getType(); CheckArgAlignment(Arg->getExprLoc(), FDecl, std::to_string(ArgIdx + 1), ArgTy, ParamTy); @@ -7982,6 +7989,23 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, } } + // If the call requires a streaming-mode change and has scalable vector + // arguments or return values, then warn the user that the streaming and + // non-streaming vector lengths may be different.
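The condition implemented just below is worth seeing on its own: warn only when a scalable value actually crosses a streaming-mode boundary. A sketch with assumed names (the real code works on FunctionType::SME_* attribute bits):

enum StreamingKind { NonStreaming, Streaming, StreamingCompatible };

static bool vlMayDifferAcrossCall(StreamingKind Caller, bool CalleeStreaming,
                                  bool CalleeStreamingCompatible,
                                  bool AnyScalableArgsOrRet) {
  if (!AnyScalableArgsOrRet || CalleeStreamingCompatible)
    return false; // nothing scalable crosses, or the callee adapts
  // A streaming-compatible caller may already be in either mode; otherwise
  // the mode changes exactly when the two streaming bits disagree.
  return Caller == StreamingCompatible ||
         ((Caller == Streaming) != CalleeStreaming);
}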
+ const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext); + if (CallerFD && (!FD || !FD->getBuiltinID()) && AnyScalableArgsOrRet) { + bool IsCalleeStreaming = + ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; + bool IsCalleeStreamingCompatible = + ExtInfo.AArch64SMEAttributes & + FunctionType::SME_PStateSMCompatibleMask; + ArmStreamingType CallerFnType = getArmStreamingFnType(CallerFD); + if (!IsCalleeStreamingCompatible && + (CallerFnType == ArmStreamingCompatible || + ((CallerFnType == ArmStreaming) ^ IsCalleeStreaming))) + Diag(Loc, diag::warn_sme_streaming_pass_return_vl_to_non_streaming); + } + FunctionType::ArmStateValue CalleeArmZAState = FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes); FunctionType::ArmStateValue CalleeArmZT0State = @@ -7990,7 +8014,7 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, CalleeArmZT0State != FunctionType::ARM_None) { bool CallerHasZAState = false; bool CallerHasZT0State = false; - if (const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) { + if (CallerFD) { auto *Attr = CallerFD->getAttr<ArmNewAttr>(); if (Attr && Attr->isNewZA()) CallerHasZAState = true; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index af6b3f21f15a65..5fed554d9e25c3 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12408,12 +12408,22 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } // Check if the function definition uses any AArch64 SME features without - // having the '+sme' feature enabled. + // having the '+sme' feature enabled, and warn the user if an SME locally + // streaming function returns or takes arguments with VL-based types. if (DeclIsDefn) { const auto *Attr = NewFD->getAttr<ArmNewAttr>(); bool UsesSM = NewFD->hasAttr<ArmLocallyStreamingAttr>(); bool UsesZA = Attr && Attr->isNewZA(); bool UsesZT0 = Attr && Attr->isNewZT0(); + + if (NewFD->hasAttr<ArmLocallyStreamingAttr>()) { + if (NewFD->getReturnType()->isSizelessVectorType() || + llvm::any_of(NewFD->parameters(), [](ParmVarDecl *P) { + return P->getOriginalType()->isSizelessVectorType(); + })) + Diag(NewFD->getLocation(), + diag::warn_sme_locally_streaming_has_vl_args_returns); + } if (const auto *FPT = NewFD->getType()->getAs<FunctionProtoType>()) { FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); UsesSM |= diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 4909414c0c78d4..a4b681ae4f008e 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -1085,9 +1085,11 @@ ExprResult Sema::ActOnPackIndexingExpr(Scope *S, Expr *PackExpression, SourceLocation RSquareLoc) { bool isParameterPack = ::isParameterPack(PackExpression); if (!isParameterPack) { - CorrectDelayedTyposInExpr(IndexExpr); - Diag(PackExpression->getBeginLoc(), diag::err_expected_name_of_pack) - << PackExpression; + if (!PackExpression->containsErrors()) { + CorrectDelayedTyposInExpr(IndexExpr); + Diag(PackExpression->getBeginLoc(), diag::err_expected_name_of_pack) + << PackExpression; + } return ExprError(); } ExprResult Res = diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index ade33ec65038fd..180574f1cab819 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4759,8 +4759,6 @@ class TemplateArgumentLocInventIterator { const TemplateArgumentLoc *operator->() const { return &Arg; } }; - TemplateArgumentLocInventIterator() { } - explicit TemplateArgumentLocInventIterator(TreeTransform<Derived> &Self, InputIterator Iter) : Self(Self), Iter(Iter) { } diff --git
a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 6dd87b5d200db6..8a4b36207c4734 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -166,9 +166,9 @@ namespace { std::optional> GetAffectingModuleMaps(const Preprocessor &PP, Module *RootModule) { - // Without implicit module map search, there's no good reason to know about - // any module maps that are not affecting. - if (!PP.getHeaderSearchInfo().getHeaderSearchOpts().ImplicitModuleMaps) + if (!PP.getHeaderSearchInfo() + .getHeaderSearchOpts() + .ModulesPruneNonAffectingModuleMaps) return std::nullopt; SmallVector ModulesToProcess{RootModule}; diff --git a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp index f02d20d45678b3..c7479d74eafc33 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp @@ -56,23 +56,23 @@ class CastValueChecker : public Checker { private: // These are known in the LLVM project. The pairs are in the following form: - // {{{namespace, call}, argument-count}, {callback, kind}} + // {{match-mode, {namespace, call}, argument-count}, {callback, kind}} const CallDescriptionMap> CDM = { - {{{"llvm", "cast"}, 1}, + {{CDM::SimpleFunc, {"llvm", "cast"}, 1}, {&CastValueChecker::evalCast, CallKind::Function}}, - {{{"llvm", "dyn_cast"}, 1}, + {{CDM::SimpleFunc, {"llvm", "dyn_cast"}, 1}, {&CastValueChecker::evalDynCast, CallKind::Function}}, - {{{"llvm", "cast_or_null"}, 1}, + {{CDM::SimpleFunc, {"llvm", "cast_or_null"}, 1}, {&CastValueChecker::evalCastOrNull, CallKind::Function}}, - {{{"llvm", "dyn_cast_or_null"}, 1}, + {{CDM::SimpleFunc, {"llvm", "dyn_cast_or_null"}, 1}, {&CastValueChecker::evalDynCastOrNull, CallKind::Function}}, - {{{"clang", "castAs"}, 0}, + {{CDM::CXXMethod, {"clang", "castAs"}, 0}, {&CastValueChecker::evalCastAs, CallKind::Method}}, - {{{"clang", "getAs"}, 0}, + {{CDM::CXXMethod, {"clang", "getAs"}, 0}, {&CastValueChecker::evalGetAs, CallKind::Method}}, - {{{"llvm", "isa"}, 1}, + {{CDM::SimpleFunc, {"llvm", "isa"}, 1}, {&CastValueChecker::evalIsa, CallKind::InstanceOf}}, - {{{"llvm", "isa_and_nonnull"}, 1}, + {{CDM::SimpleFunc, {"llvm", "isa_and_nonnull"}, 1}, {&CastValueChecker::evalIsaAndNonNull, CallKind::InstanceOf}}}; void evalCast(const CallEvent &Call, DefinedOrUnknownSVal DV, diff --git a/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp index be7be15022d360..3a0a01c23de03e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ChrootChecker.cpp @@ -43,7 +43,8 @@ class ChrootChecker : public Checker { // This bug refers to possibly break out of a chroot() jail. 
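The checker changes above and below all migrate CallDescription keys from a {qualified-name, arity} pair to a {match-mode, qualified-name, arity} triple. A toy model of the new key shape (stand-in types; the real CallDescription and CallDescriptionMap live in the static analyzer core):

#include <string>
#include <vector>

enum class CDM { SimpleFunc, CXXMethod, CLibrary };

struct CallKeySketch {
  CDM MatchMode;                          // how strictly the callee must match
  std::vector<std::string> QualifiedName; // e.g. {"llvm", "dyn_cast"}
  unsigned RequiredArgs;                  // expected argument count
};

// Old form: {{"chroot"}, 1}. The new form carries the mode up front:
const CallKeySketch ChrootKey{CDM::CLibrary, {"chroot"}, 1};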
const BugType BT_BreakJail{this, "Break out of jail"}; - const CallDescription Chroot{{"chroot"}, 1}, Chdir{{"chdir"}, 1}; + const CallDescription Chroot{CDM::CLibrary, {"chroot"}, 1}, + Chdir{CDM::CLibrary, {"chdir"}, 1}; public: ChrootChecker() {} diff --git a/clang/lib/StaticAnalyzer/Checkers/ErrnoTesterChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ErrnoTesterChecker.cpp index c46ebee0c94ff4..6076a6bc789737 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ErrnoTesterChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ErrnoTesterChecker.cpp @@ -70,13 +70,15 @@ class ErrnoTesterChecker : public Checker { using EvalFn = std::function; const CallDescriptionMap TestCalls{ - {{{"ErrnoTesterChecker_setErrno"}, 1}, &ErrnoTesterChecker::evalSetErrno}, - {{{"ErrnoTesterChecker_getErrno"}, 0}, &ErrnoTesterChecker::evalGetErrno}, - {{{"ErrnoTesterChecker_setErrnoIfError"}, 0}, + {{CDM::SimpleFunc, {"ErrnoTesterChecker_setErrno"}, 1}, + &ErrnoTesterChecker::evalSetErrno}, + {{CDM::SimpleFunc, {"ErrnoTesterChecker_getErrno"}, 0}, + &ErrnoTesterChecker::evalGetErrno}, + {{CDM::SimpleFunc, {"ErrnoTesterChecker_setErrnoIfError"}, 0}, &ErrnoTesterChecker::evalSetErrnoIfError}, - {{{"ErrnoTesterChecker_setErrnoIfErrorRange"}, 0}, + {{CDM::SimpleFunc, {"ErrnoTesterChecker_setErrnoIfErrorRange"}, 0}, &ErrnoTesterChecker::evalSetErrnoIfErrorRange}, - {{{"ErrnoTesterChecker_setErrnoCheckState"}, 0}, + {{CDM::SimpleFunc, {"ErrnoTesterChecker_setErrnoCheckState"}, 0}, &ErrnoTesterChecker::evalSetErrnoCheckState}}; }; diff --git a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp index 2e31c16e457c2e..cd1dd1b2fc511f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MmapWriteExecChecker.cpp @@ -27,8 +27,8 @@ using namespace ento; namespace { class MmapWriteExecChecker : public Checker { - CallDescription MmapFn; - CallDescription MprotectFn; + CallDescription MmapFn{CDM::CLibrary, {"mmap"}, 6}; + CallDescription MprotectFn{CDM::CLibrary, {"mprotect"}, 3}; static int ProtWrite; static int ProtExec; static int ProtRead; @@ -36,7 +36,6 @@ class MmapWriteExecChecker : public Checker { "Security"}; public: - MmapWriteExecChecker() : MmapFn({"mmap"}, 6), MprotectFn({"mprotect"}, 3) {} void checkPreCall(const CallEvent &Call, CheckerContext &C) const; int ProtExecOv; int ProtReadOv; diff --git a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp index f7b7befe28ee7d..19877964bd900a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp @@ -129,9 +129,11 @@ static llvm::StringRef indefiniteArticleBasedOnVowel(char a) { class StdVariantChecker : public Checker { // Call descriptors to find relevant calls - CallDescription VariantConstructor{{"std", "variant", "variant"}}; - CallDescription VariantAssignmentOperator{{"std", "variant", "operator="}}; - CallDescription StdGet{{"std", "get"}, 1, 1}; + CallDescription VariantConstructor{CDM::CXXMethod, + {"std", "variant", "variant"}}; + CallDescription VariantAssignmentOperator{CDM::CXXMethod, + {"std", "variant", "operator="}}; + CallDescription StdGet{CDM::SimpleFunc, {"std", "get"}, 1, 1}; BugType BadVariantType{this, "BadVariantType", "BadVariantType"}; @@ -295,4 +297,4 @@ bool clang::ento::shouldRegisterStdVariantChecker( void clang::ento::registerStdVariantChecker(clang::ento::CheckerManager &mgr) { 
mgr.registerChecker<StdVariantChecker>(); -} \ No newline at end of file +} diff --git a/clang/lib/StaticAnalyzer/Checkers/StringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StringChecker.cpp index 2dc9e29ca90688..8f1c31763e212c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StringChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StringChecker.cpp @@ -27,7 +27,7 @@ class StringChecker : public Checker { mutable const FunctionDecl *StringConstCharPtrCtor = nullptr; mutable CanQualType SizeTypeTy; const CallDescription TwoParamStdStringCtor = { - {"std", "basic_string", "basic_string"}, 2, 2}; + CDM::CXXMethod, {"std", "basic_string", "basic_string"}, 2, 2}; bool isCharToStringCtor(const CallEvent &Call, const ASTContext &ACtx) const; diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp index eae162cda69310..a82f7caf16b291 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp @@ -30,7 +30,7 @@ class PutenvWithAutoChecker : public Checker { private: BugType BT{this, "'putenv' function should not be called with auto variables", categories::SecurityError}; - const CallDescription Putenv{{"putenv"}, 1}; + const CallDescription Putenv{CDM::CLibrary, {"putenv"}, 1}; public: void checkPostCall(const CallEvent &Call, CheckerContext &C) const; diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index ebba181eb2d842..ba29c123139016 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -2358,11 +2358,12 @@ StoreRef RegionStoreManager::killBinding(Store ST, Loc L) { RegionBindingsRef RegionStoreManager::bind(RegionBindingsConstRef B, Loc L, SVal V) { - if (L.getAs<loc::ConcreteInt>()) + // We only care about region locations. + auto MemRegVal = L.getAs<loc::MemRegionVal>(); + if (!MemRegVal) return B; - // If we get here, the location should be a region. - const MemRegion *R = L.castAs<loc::MemRegionVal>().getRegion(); + const MemRegion *R = MemRegVal->getRegion(); // Check if the region is a struct region. if (const TypedValueRegion* TR = dyn_cast<TypedValueRegion>(R)) { diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index e19f19b2528c15..f46324ee9989eb 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -179,6 +179,11 @@ makeCommonInvocationForModuleBuild(CompilerInvocation CI) { CI.resetNonModularOptions(); CI.clearImplicitModuleBuildOptions(); + // The scanner takes care to avoid passing non-affecting module maps to the + // explicit compiles. No need to do extra work just to find out there are no + // module map files to prune. + CI.getHeaderSearchOpts().ModulesPruneNonAffectingModuleMaps = false; + // Remove options that are incompatible with an explicit module build or // that are likely to differ between identical modules discovered from // different translation units.
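One line of the ModuleDepCollector hunk above deserves a gloss: the scanner already strips non-affecting module maps from the command lines it emits, so it switches the writer-side pruning off rather than paying for a no-op walk. Schematically (the struct and field mirror the option name above but are otherwise illustrative):

struct HeaderSearchOptsSketch {
  // Defaults to true: the ASTWriter prunes module maps that cannot affect
  // the compilation before serializing them.
  bool ModulesPruneNonAffectingModuleMaps = true;
};

static void tuneForScannedExplicitBuild(HeaderSearchOptsSketch &Opts) {
  // Inputs are already minimal; discovering "nothing to prune" costs time.
  Opts.ModulesPruneNonAffectingModuleMaps = false;
}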
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index 1a29a664d7ce54..0cbab1fcd91d09 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -633,3 +633,9 @@ void test7(void) { X = CFSTR("foo", "bar"); // both-error {{too many arguments to function call}} #endif } + +/// The actual value on my machine is 22, but I have a feeling this will be different +/// on other targets, so just checking for != 0 here. Light testing is fine since +/// the actual implementation uses analyze_os_log::computeOSLogBufferLayout(), which +/// is tested elsewhere. +static_assert(__builtin_os_log_format_buffer_size("%{mask.xyz}s", "abc") != 0, ""); diff --git a/clang/test/Analysis/gh-issue-89185.c b/clang/test/Analysis/gh-issue-89185.c new file mode 100644 index 00000000000000..8a907f198a5fd5 --- /dev/null +++ b/clang/test/Analysis/gh-issue-89185.c @@ -0,0 +1,14 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s + +void clang_analyzer_dump(char); +void clang_analyzer_dump_ptr(char*); + +// https://github.com/llvm/llvm-project/issues/89185 +void binding_to_label_loc() { + char *b = &&MyLabel; +MyLabel: + *b = 0; // no-crash + clang_analyzer_dump_ptr(b); // expected-warning {{&&MyLabel}} + clang_analyzer_dump(*b); // expected-warning {{Unknown}} + // FIXME: We should never reach here, as storing to a label is invalid. +} diff --git a/clang/test/ClangScanDeps/modules-full.cpp b/clang/test/ClangScanDeps/modules-full.cpp index 59efef0ecbaa64..a00a431eb56911 100644 --- a/clang/test/ClangScanDeps/modules-full.cpp +++ b/clang/test/ClangScanDeps/modules-full.cpp @@ -33,6 +33,7 @@ // CHECK-NEXT: "command-line": [ // CHECK-NEXT: "-cc1" // CHECK: "-emit-module" +// CHECK: "-fno-modules-prune-non-affecting-module-map-files" // CHECK: "-fmodule-file={{.*}}[[PREFIX]]/module-cache{{(_clangcl)?}}/[[HASH_H2_DINCLUDE]]/header2-{{[A-Z0-9]+}}.pcm" // CHECK-NOT: "-fimplicit-module-maps" // CHECK: "-fmodule-name=header1" @@ -51,6 +52,7 @@ // CHECK-NEXT: "command-line": [ // CHECK-NEXT: "-cc1", // CHECK: "-emit-module", +// CHECK: "-fno-modules-prune-non-affecting-module-map-files" // CHECK-NOT: "-fimplicit-module-maps", // CHECK: "-fmodule-name=header1", // CHECK: "-fno-implicit-modules", @@ -68,6 +70,7 @@ // CHECK-NEXT: "command-line": [ // CHECK-NEXT: "-cc1", // CHECK: "-emit-module", +// CHECK: "-fno-modules-prune-non-affecting-module-map-files" // CHECK: "-fmodule-name=header2", // CHECK-NOT: "-fimplicit-module-maps", // CHECK: "-fno-implicit-modules", diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-htm.c b/clang/test/CodeGen/PowerPC/builtins-ppc-htm.c index 51585f27e0bc70..eeb13b097819cd 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-htm.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-htm.c @@ -1,6 +1,6 @@ // REQUIRES: powerpc-registered-target // RUN: %clang_cc1 -target-feature +altivec -target-feature +htm -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -// RUN: not %clang_cc1 -target-feature +altivec -target-feature -htm -triple powerpc64-unknown-unknown -emit-llvm %s 2>&1 | FileCheck %s --check-prefix=ERROR +// RUN: not %clang_cc1 -target-feature +altivec -target-feature -htm -triple powerpc64-unknown-unknown -emit-llvm-only %s 2>&1 | FileCheck %s --check-prefix=ERROR void test1(long int *r, int code, long int *a, long int *b) { // CHECK-LABEL: define{{.*}} void @test1 diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c 
b/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c index f5149bf4ce8fda..485ef84df086b7 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-vec-ins-error.c @@ -1,17 +1,17 @@ // REQUIRES: powerpc-registered-target // RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \ -// RUN: -triple powerpc64le-unknown-unknown -emit-llvm -ferror-limit 10 %s -verify -D __TEST_ELT_SI +// RUN: -triple powerpc64le-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_SI // RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \ -// RUN: -triple powerpc64-unknown-unknown -emit-llvm -ferror-limit 10 %s -verify -D __TEST_ELT_F +// RUN: -triple powerpc64-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_F // RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \ -// RUN: -triple powerpc64le-unknown-unknown -emit-llvm -ferror-limit 10 %s -verify -D __TEST_ELT_SLL +// RUN: -triple powerpc64le-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_SLL // RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \ -// RUN: -triple powerpc64-unknown-unknown -emit-llvm -ferror-limit 10 %s -verify -D __TEST_ELT_D +// RUN: -triple powerpc64-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_ELT_D // RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \ -// RUN: -triple powerpc64le-unknown-unknown -emit-llvm -ferror-limit 10 %s -verify -D __TEST_UNALIGNED_UI +// RUN: -triple powerpc64le-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify -D __TEST_UNALIGNED_UI // RUN: %clang_cc1 -flax-vector-conversions=none -target-feature +vsx -target-cpu pwr10 -fsyntax-only \ -// RUN: -triple powerpc64-unknown-unknown -emit-llvm -ferror-limit 10 %s -verify +// RUN: -triple powerpc64-unknown-unknown -emit-llvm-only -ferror-limit 10 %s -verify #include diff --git a/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c b/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c index b303d71304bf3e..66d15a7d1bc052 100644 --- a/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c +++ b/clang/test/CodeGen/RISCV/riscv-func-attr-target-err.c @@ -1,6 +1,6 @@ // REQUIRES: riscv-registered-target // RUN: not %clang_cc1 -triple riscv64 -target-feature +zifencei -target-feature +m -target-feature +a \ -// RUN: -emit-llvm %s 2>&1 | FileCheck %s +// RUN: -emit-llvm-only %s 2>&1 | FileCheck %s #include diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.c b/clang/test/CodeGen/attr-counted-by-pr88931.c new file mode 100644 index 00000000000000..cc3d751c7c6d83 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-pr88931.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wno-missing-declarations -emit-llvm -o - %s | FileCheck %s + +struct foo { + int x,y,z; + struct bar { + int count; + int array[] __attribute__((counted_by(count))); + }; +}; + +void init(void * __attribute__((pass_dynamic_object_size(0)))); + +// CHECK-LABEL: define dso_local void @test1( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4 +// CHECK-NEXT: 
tail call void @init(ptr noundef nonnull [[ARRAY]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +void test1(struct bar *p) { + init(p->array); +} + +struct mux { + int count; + int array[] __attribute__((counted_by(count))); +}; + +struct bux { struct mux x; }; + +// CHECK-LABEL: define dso_local void @test2( +// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @init(ptr noundef [[P]], i64 noundef -1) #[[ATTR2]] +// CHECK-NEXT: ret void +// +void test2(struct bux *p) { + init(p); +} diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp new file mode 100644 index 00000000000000..2a8cc1d07e50d9 --- /dev/null +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -0,0 +1,21 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O2 -Wall -emit-llvm -o - %s | FileCheck %s + +struct foo { + struct bar { + int array[]; + bar(); + }; +}; + +void init(void * __attribute__((pass_dynamic_object_size(0)))); + +// CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +foo::bar::bar() { + init(array); +} diff --git a/clang/test/CodeGen/debug-info-file-checksum.c b/clang/test/CodeGen/debug-info-file-checksum.c index e018dbf64fc98f..2ca91d605d1c4b 100644 --- a/clang/test/CodeGen/debug-info-file-checksum.c +++ b/clang/test/CodeGen/debug-info-file-checksum.c @@ -1,3 +1,6 @@ +// AIX does not support -gdwarf-5. 
+// UNSUPPORTED: target={{.*}}-aix{{.*}} + // RUN: %clang -emit-llvm -S -g -gcodeview -x c \ // RUN: %S/Inputs/debug-info-file-checksum.c -o - | FileCheck %s // RUN: %clang -emit-llvm -S -g -gcodeview -Xclang -gsrc-hash=md5 \ diff --git a/clang/test/CodeGen/pgo-force-function-attrs.ll b/clang/test/CodeGen/pgo-force-function-attrs.ll new file mode 100644 index 00000000000000..3e9ea95e4df410 --- /dev/null +++ b/clang/test/CodeGen/pgo-force-function-attrs.ll @@ -0,0 +1,12 @@ +; RUN: %clang_cc1 -O2 -mllvm -pgo-cold-func-opt=optsize -mllvm -enable-pgo-force-function-attrs -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - | FileCheck %s --check-prefix=OPTSIZE +; Check that no profile means no optsize +; RUN: %clang_cc1 -O2 -mllvm -pgo-cold-func-opt=optsize -mllvm -enable-pgo-force-function-attrs %s -emit-llvm -o - | FileCheck %s --check-prefix=NONE +; Check that no -pgo-cold-func-opt=optsize means no optsize +; RUN: %clang_cc1 -O2 -mllvm -enable-pgo-force-function-attrs -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - | FileCheck %s --check-prefix=NONE + +; NONE-NOT: optsize +; OPTSIZE: optsize + +define void @f() cold { + ret void +} diff --git a/clang/test/CodeGenCXX/mangle-lambdas-gh88906.cpp b/clang/test/CodeGenCXX/mangle-lambdas-gh88906.cpp new file mode 100644 index 00000000000000..e7592cec5da776 --- /dev/null +++ b/clang/test/CodeGenCXX/mangle-lambdas-gh88906.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu %s -emit-llvm -mconstructor-aliases -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fclang-abi-compat=18 %s -emit-llvm -mconstructor-aliases -o - | FileCheck --check-prefix=CLANG18 %s +// RUN: %clang_cc1 -triple i386-pc-win32 %s -emit-llvm -mconstructor-aliases -o - | FileCheck --check-prefix=MSABI %s + + +class func { +public: + template + func(T){}; + template + func(T, U){}; +}; + +void GH88906(){ + class Test{ + public: + func a{[]{ }, []{ }}; + func b{[]{ }}; + func c{[]{ }}; + } test; +} + +// CHECK-LABEL: define internal void @_ZZ7GH88906vEN4TestC2Ev +// CHECK: call void @_ZN4funcC2IN7GH889064Test1aMUlvE_ENS3_UlvE0_EEET_T0_ +// CHECK: call void @_ZN4funcC2IN7GH889064Test1bMUlvE_EEET_ +// CHECK: call void @_ZN4funcC2IN7GH889064Test1cMUlvE_EEET_ + +// CHECK-LABEL: define internal void @_ZN4funcC2IN7GH889064Test1aMUlvE_ENS3_UlvE0_EEET_T0_ +// CHECK-LABEL: define internal void @_ZN4funcC2IN7GH889064Test1bMUlvE_EEET_ +// CHECK-LABEL: define internal void @_ZN4funcC2IN7GH889064Test1cMUlvE_EEET_ + +// CLANG18-LABEL: define internal void @_ZZ7GH88906vEN4TestC2Ev +// CLANG18: call void @_ZN4funcC2IZ7GH88906vEN4TestUlvE_EZ7GH88906vENS1_UlvE0_EEET_T0_ +// CLANG18: call void @_ZN4funcC2IZ7GH88906vEN4TestUlvE_EEET_ +// CLANG18: call void @_ZN4funcC2IZ7GH88906vEN4TestUlvE_EEET_ + + + +// MSABI-LABEL: define internal x86_thiscallcc noundef ptr @"??0Test@?1??GH88906@@YAXXZ@QAE@XZ" +// MSABI: call x86_thiscallcc noundef ptr @"??$?0V@a@Test@?1??GH88906@@YAXXZ@V@12?1??3@YAXXZ@@func@@QAE@V@a@Test@?1??GH88906@@YAXXZ@V@23?1??4@YAXXZ@@Z" +// MSABI: call x86_thiscallcc noundef ptr @"??$?0V@b@Test@?1??GH88906@@YAXXZ@@func@@QAE@V@b@Test@?1??GH88906@@YAXXZ@@Z" +// MSABI: call x86_thiscallcc noundef ptr @"??$?0V@c@Test@?1??GH88906@@YAXXZ@@func@@QAE@V@c@Test@?1??GH88906@@YAXXZ@@Z" + +// MSABI-LABEL: define internal x86_thiscallcc noundef ptr @"??$?0V@a@Test@?1??GH88906@@YAXXZ@V@12?1??3@YAXXZ@@func@@QAE@V@a@Test@?1??GH88906@@YAXXZ@V@23?1??4@YAXXZ@@Z" +// MSABI-LABEL: define internal x86_thiscallcc noundef ptr 
@"??$?0V@b@Test@?1??GH88906@@YAXXZ@@func@@QAE@V@b@Test@?1??GH88906@@YAXXZ@@Z" +// MSABI-LABEL: define internal x86_thiscallcc noundef ptr @"??$?0V@c@Test@?1??GH88906@@YAXXZ@@func@@QAE@V@c@Test@?1??GH88906@@YAXXZ@@Z" diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl index 749eac6d64736d..bd4f38067a5c59 100644 --- a/clang/test/CodeGenHLSL/builtins/mad.hlsl +++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl @@ -1,182 +1,238 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF + +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_CHECK #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: %dx.umad = call i16 @llvm.dx.umad.i16(i16 %0, i16 %1, i16 %2) -// NATIVE_HALF: ret i16 %dx.umad +// DXIL_NATIVE_HALF: %dx.umad = call i16 @llvm.dx.umad.i16(i16 %0, i16 %1, i16 %2) +// DXIL_NATIVE_HALF: ret i16 %dx.umad +// SPIR_NATIVE_HALF: mul nuw i16 %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nuw i16 %{{.*}}, %{{.*}} uint16_t test_mad_uint16_t(uint16_t p0, uint16_t p1, uint16_t p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.umad = call <2 x i16> @llvm.dx.umad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2) -// NATIVE_HALF: ret <2 x i16> %dx.umad +// DXIL_NATIVE_HALF: %dx.umad = call <2 x i16> @llvm.dx.umad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2) +// DXIL_NATIVE_HALF: ret <2 x i16> %dx.umad +// SPIR_NATIVE_HALF: mul nuw <2 x i16> %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nuw <2 x i16> %{{.*}}, %{{.*}} uint16_t2 test_mad_uint16_t2(uint16_t2 p0, uint16_t2 p1, uint16_t2 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.umad = call <3 x i16> @llvm.dx.umad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2) -// NATIVE_HALF: ret <3 x i16> %dx.umad +// DXIL_NATIVE_HALF: %dx.umad = call <3 x i16> @llvm.dx.umad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2) +// DXIL_NATIVE_HALF: ret <3 x i16> %dx.umad +// SPIR_NATIVE_HALF: mul nuw <3 x i16> %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nuw <3 x i16> %{{.*}}, %{{.*}} uint16_t3 test_mad_uint16_t3(uint16_t3 p0, uint16_t3 p1, uint16_t3 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.umad = call <4 x i16> @llvm.dx.umad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) -// NATIVE_HALF: ret <4 x i16> %dx.umad +// DXIL_NATIVE_HALF: %dx.umad = call <4 x i16> @llvm.dx.umad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) +// DXIL_NATIVE_HALF: ret <4 x i16> %dx.umad +// SPIR_NATIVE_HALF: mul nuw <4 x i16> %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nuw <4 x i16> %{{.*}}, %{{.*}} uint16_t4 test_mad_uint16_t4(uint16_t4 p0, uint16_t4 p1, uint16_t4 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.imad = call i16 
@llvm.dx.imad.i16(i16 %0, i16 %1, i16 %2) -// NATIVE_HALF: ret i16 %dx.imad +// DXIL_NATIVE_HALF: %dx.imad = call i16 @llvm.dx.imad.i16(i16 %0, i16 %1, i16 %2) +// DXIL_NATIVE_HALF: ret i16 %dx.imad +// SPIR_NATIVE_HALF: mul nsw i16 %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nsw i16 %{{.*}}, %{{.*}} int16_t test_mad_int16_t(int16_t p0, int16_t p1, int16_t p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.imad = call <2 x i16> @llvm.dx.imad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2) -// NATIVE_HALF: ret <2 x i16> %dx.imad +// DXIL_NATIVE_HALF: %dx.imad = call <2 x i16> @llvm.dx.imad.v2i16(<2 x i16> %0, <2 x i16> %1, <2 x i16> %2) +// DXIL_NATIVE_HALF: ret <2 x i16> %dx.imad +// SPIR_NATIVE_HALF: mul nsw <2 x i16> %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nsw <2 x i16> %{{.*}}, %{{.*}} int16_t2 test_mad_int16_t2(int16_t2 p0, int16_t2 p1, int16_t2 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.imad = call <3 x i16> @llvm.dx.imad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2) -// NATIVE_HALF: ret <3 x i16> %dx.imad +// DXIL_NATIVE_HALF: %dx.imad = call <3 x i16> @llvm.dx.imad.v3i16(<3 x i16> %0, <3 x i16> %1, <3 x i16> %2) +// DXIL_NATIVE_HALF: ret <3 x i16> %dx.imad +// SPIR_NATIVE_HALF: mul nsw <3 x i16> %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nsw <3 x i16> %{{.*}}, %{{.*}} int16_t3 test_mad_int16_t3(int16_t3 p0, int16_t3 p1, int16_t3 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.imad = call <4 x i16> @llvm.dx.imad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) -// NATIVE_HALF: ret <4 x i16> %dx.imad +// DXIL_NATIVE_HALF: %dx.imad = call <4 x i16> @llvm.dx.imad.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) +// DXIL_NATIVE_HALF: ret <4 x i16> %dx.imad +// SPIR_NATIVE_HALF: mul nsw <4 x i16> %{{.*}}, %{{.*}} +// SPIR_NATIVE_HALF: add nsw <4 x i16> %{{.*}}, %{{.*}} int16_t4 test_mad_int16_t4(int16_t4 p0, int16_t4 p1, int16_t4 p2) { return mad(p0, p1, p2); } #endif // __HLSL_ENABLE_16_BIT -// NATIVE_HALF: %dx.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2) -// NATIVE_HALF: ret half %dx.fmad -// NO_HALF: %dx.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) -// NO_HALF: ret float %dx.fmad +// NATIVE_HALF: %hlsl.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2) +// NATIVE_HALF: ret half %hlsl.fmad +// NO_HALF: %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) +// NO_HALF: ret float %hlsl.fmad half test_mad_half(half p0, half p1, half p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2) -// NATIVE_HALF: ret <2 x half> %dx.fmad -// NO_HALF: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2) -// NO_HALF: ret <2 x float> %dx.fmad +// NATIVE_HALF: %hlsl.fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2) +// NATIVE_HALF: ret <2 x half> %hlsl.fmad +// NO_HALF: %hlsl.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2) +// NO_HALF: ret <2 x float> %hlsl.fmad half2 test_mad_half2(half2 p0, half2 p1, half2 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.fmad = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2) -// NATIVE_HALF: ret <3 x half> %dx.fmad -// NO_HALF: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2) -// NO_HALF: ret <3 x float> %dx.fmad +// NATIVE_HALF: %hlsl.fmad = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> %0, <3 
x half> %1, <3 x half> %2) +// NATIVE_HALF: ret <3 x half> %hlsl.fmad +// NO_HALF: %hlsl.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2) +// NO_HALF: ret <3 x float> %hlsl.fmad half3 test_mad_half3(half3 p0, half3 p1, half3 p2) { return mad(p0, p1, p2); } -// NATIVE_HALF: %dx.fmad = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2) -// NATIVE_HALF: ret <4 x half> %dx.fmad -// NO_HALF: %dx.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) -// NO_HALF: ret <4 x float> %dx.fmad +// NATIVE_HALF: %hlsl.fmad = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2) +// NATIVE_HALF: ret <4 x half> %hlsl.fmad +// NO_HALF: %hlsl.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) +// NO_HALF: ret <4 x float> %hlsl.fmad half4 test_mad_half4(half4 p0, half4 p1, half4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) -// CHECK: ret float %dx.fmad +// CHECK: %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) +// CHECK: ret float %hlsl.fmad float test_mad_float(float p0, float p1, float p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2) -// CHECK: ret <2 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2) +// CHECK: ret <2 x float> %hlsl.fmad float2 test_mad_float2(float2 p0, float2 p1, float2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2) -// CHECK: ret <3 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2) +// CHECK: ret <3 x float> %hlsl.fmad float3 test_mad_float3(float3 p0, float3 p1, float3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) -// CHECK: ret <4 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) +// CHECK: ret <4 x float> %hlsl.fmad float4 test_mad_float4(float4 p0, float4 p1, float4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2) -// CHECK: ret double %dx.fmad +// CHECK: %hlsl.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2) +// CHECK: ret double %hlsl.fmad double test_mad_double(double p0, double p1, double p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2) -// CHECK: ret <2 x double> %dx.fmad +// CHECK: %hlsl.fmad = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2) +// CHECK: ret <2 x double> %hlsl.fmad double2 test_mad_double2(double2 p0, double2 p1, double2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> %0, <3 x double> %1, <3 x double> %2) -// CHECK: ret <3 x double> %dx.fmad +// CHECK: %hlsl.fmad = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> %0, <3 x double> %1, <3 x double> %2) +// CHECK: ret <3 x double> %hlsl.fmad double3 test_mad_double3(double3 p0, double3 p1, double3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <4 x double> 
@llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2) -// CHECK: ret <4 x double> %dx.fmad +// CHECK: %hlsl.fmad = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %2) +// CHECK: ret <4 x double> %hlsl.fmad double4 test_mad_double4(double4 p0, double4 p1, double4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call i32 @llvm.dx.imad.i32(i32 %0, i32 %1, i32 %2) -// CHECK: ret i32 %dx.imad +// DXIL_CHECK: %dx.imad = call i32 @llvm.dx.imad.i32(i32 %0, i32 %1, i32 %2) +// DXIL_CHECK: ret i32 %dx.imad +// SPIR_CHECK: mul nsw i32 %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw i32 %{{.*}}, %{{.*}} int test_mad_int(int p0, int p1, int p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call <2 x i32> @llvm.dx.imad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) -// CHECK: ret <2 x i32> %dx.imad +// DXIL_CHECK: %dx.imad = call <2 x i32> @llvm.dx.imad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) +// DXIL_CHECK: ret <2 x i32> %dx.imad +// SPIR_CHECK: mul nsw <2 x i32> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw <2 x i32> %{{.*}}, %{{.*}} int2 test_mad_int2(int2 p0, int2 p1, int2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call <3 x i32> @llvm.dx.imad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2) -// CHECK: ret <3 x i32> %dx.imad +// DXIL_CHECK: %dx.imad = call <3 x i32> @llvm.dx.imad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2) +// DXIL_CHECK: ret <3 x i32> %dx.imad +// SPIR_CHECK: mul nsw <3 x i32> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw <3 x i32> %{{.*}}, %{{.*}} int3 test_mad_int3(int3 p0, int3 p1, int3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call <4 x i32> @llvm.dx.imad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) -// CHECK: ret <4 x i32> %dx.imad +// DXIL_CHECK: %dx.imad = call <4 x i32> @llvm.dx.imad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) +// DXIL_CHECK: ret <4 x i32> %dx.imad +// SPIR_CHECK: mul nsw <4 x i32> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw <4 x i32> %{{.*}}, %{{.*}} int4 test_mad_int4(int4 p0, int4 p1, int4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call i64 @llvm.dx.imad.i64(i64 %0, i64 %1, i64 %2) -// CHECK: ret i64 %dx.imad +// DXIL_CHECK: %dx.imad = call i64 @llvm.dx.imad.i64(i64 %0, i64 %1, i64 %2) +// DXIL_CHECK: ret i64 %dx.imad +// SPIR_CHECK: mul nsw i64 %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw i64 %{{.*}}, %{{.*}} int64_t test_mad_int64_t(int64_t p0, int64_t p1, int64_t p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call <2 x i64> @llvm.dx.imad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) -// CHECK: ret <2 x i64> %dx.imad +// DXIL_CHECK: %dx.imad = call <2 x i64> @llvm.dx.imad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) +// DXIL_CHECK: ret <2 x i64> %dx.imad +// SPIR_CHECK: mul nsw <2 x i64> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw <2 x i64> %{{.*}}, %{{.*}} int64_t2 test_mad_int64_t2(int64_t2 p0, int64_t2 p1, int64_t2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call <3 x i64> @llvm.dx.imad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2) -// CHECK: ret <3 x i64> %dx.imad +// DXIL_CHECK: %dx.imad = call <3 x i64> @llvm.dx.imad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2) +// DXIL_CHECK: ret <3 x i64> %dx.imad +// SPIR_CHECK: mul nsw <3 x i64> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw <3 x i64> %{{.*}}, %{{.*}} int64_t3 test_mad_int64_t3(int64_t3 p0, int64_t3 p1, int64_t3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.imad = call <4 x i64> @llvm.dx.imad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2) -// CHECK: ret <4 x i64> %dx.imad +// 
DXIL_CHECK: %dx.imad = call <4 x i64> @llvm.dx.imad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2) +// DXIL_CHECK: ret <4 x i64> %dx.imad +// SPIR_CHECK: mul nsw <4 x i64> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nsw <4 x i64> %{{.*}}, %{{.*}} int64_t4 test_mad_int64_t4(int64_t4 p0, int64_t4 p1, int64_t4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call i32 @llvm.dx.umad.i32(i32 %0, i32 %1, i32 %2) -// CHECK: ret i32 %dx.umad +// DXIL_CHECK: %dx.umad = call i32 @llvm.dx.umad.i32(i32 %0, i32 %1, i32 %2) +// DXIL_CHECK: ret i32 %dx.umad +// SPIR_CHECK: mul nuw i32 %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw i32 %{{.*}}, %{{.*}} uint test_mad_uint(uint p0, uint p1, uint p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call <2 x i32> @llvm.dx.umad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) -// CHECK: ret <2 x i32> %dx.umad +// DXIL_CHECK: %dx.umad = call <2 x i32> @llvm.dx.umad.v2i32(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) +// DXIL_CHECK: ret <2 x i32> %dx.umad +// SPIR_CHECK: mul nuw <2 x i32> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw <2 x i32> %{{.*}}, %{{.*}} uint2 test_mad_uint2(uint2 p0, uint2 p1, uint2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call <3 x i32> @llvm.dx.umad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2) -// CHECK: ret <3 x i32> %dx.umad +// DXIL_CHECK: %dx.umad = call <3 x i32> @llvm.dx.umad.v3i32(<3 x i32> %0, <3 x i32> %1, <3 x i32> %2) +// DXIL_CHECK: ret <3 x i32> %dx.umad +// SPIR_CHECK: mul nuw <3 x i32> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw <3 x i32> %{{.*}}, %{{.*}} uint3 test_mad_uint3(uint3 p0, uint3 p1, uint3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call <4 x i32> @llvm.dx.umad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) -// CHECK: ret <4 x i32> %dx.umad +// DXIL_CHECK: %dx.umad = call <4 x i32> @llvm.dx.umad.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) +// DXIL_CHECK: ret <4 x i32> %dx.umad +// SPIR_CHECK: mul nuw <4 x i32> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw <4 x i32> %{{.*}}, %{{.*}} uint4 test_mad_uint4(uint4 p0, uint4 p1, uint4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call i64 @llvm.dx.umad.i64(i64 %0, i64 %1, i64 %2) -// CHECK: ret i64 %dx.umad +// DXIL_CHECK: %dx.umad = call i64 @llvm.dx.umad.i64(i64 %0, i64 %1, i64 %2) +// DXIL_CHECK: ret i64 %dx.umad +// SPIR_CHECK: mul nuw i64 %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw i64 %{{.*}}, %{{.*}} uint64_t test_mad_uint64_t(uint64_t p0, uint64_t p1, uint64_t p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call <2 x i64> @llvm.dx.umad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) -// CHECK: ret <2 x i64> %dx.umad +// DXIL_CHECK: %dx.umad = call <2 x i64> @llvm.dx.umad.v2i64(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) +// DXIL_CHECK: ret <2 x i64> %dx.umad +// SPIR_CHECK: mul nuw <2 x i64> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw <2 x i64> %{{.*}}, %{{.*}} uint64_t2 test_mad_uint64_t2(uint64_t2 p0, uint64_t2 p1, uint64_t2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call <3 x i64> @llvm.dx.umad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2) -// CHECK: ret <3 x i64> %dx.umad +// DXIL_CHECK: %dx.umad = call <3 x i64> @llvm.dx.umad.v3i64(<3 x i64> %0, <3 x i64> %1, <3 x i64> %2) +// DXIL_CHECK: ret <3 x i64> %dx.umad +// SPIR_CHECK: mul nuw <3 x i64> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw <3 x i64> %{{.*}}, %{{.*}} uint64_t3 test_mad_uint64_t3(uint64_t3 p0, uint64_t3 p1, uint64_t3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.umad = call <4 x i64> @llvm.dx.umad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2) -// CHECK: ret <4 x 
i64> %dx.umad +// DXIL_CHECK: %dx.umad = call <4 x i64> @llvm.dx.umad.v4i64(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2) +// DXIL_CHECK: ret <4 x i64> %dx.umad +// SPIR_CHECK: mul nuw <4 x i64> %{{.*}}, %{{.*}} +// SPIR_CHECK: add nuw <4 x i64> %{{.*}}, %{{.*}} uint64_t4 test_mad_uint64_t4(uint64_t4 p0, uint64_t4 p1, uint64_t4 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2) -// CHECK: ret <2 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2) +// CHECK: ret <2 x float> %hlsl.fmad float2 test_mad_float2_splat(float p0, float2 p1, float2 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2) -// CHECK: ret <3 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2) +// CHECK: ret <3 x float> %hlsl.fmad float3 test_mad_float3_splat(float p0, float3 p1, float3 p2) { return mad(p0, p1, p2); } -// CHECK: %dx.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2) -// CHECK: ret <4 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2) +// CHECK: ret <4 x float> %hlsl.fmad float4 test_mad_float4_splat(float p0, float4 p1, float4 p2) { return mad(p0, p1, p2); } // CHECK: %conv = sitofp i32 %2 to float // CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0 // CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer -// CHECK: %dx.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat) -// CHECK: ret <2 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat) +// CHECK: ret <2 x float> %hlsl.fmad float2 test_mad_float2_int_splat(float2 p0, float2 p1, int p2) { return mad(p0, p1, p2); } @@ -184,8 +240,8 @@ float2 test_mad_float2_int_splat(float2 p0, float2 p1, int p2) { // CHECK: %conv = sitofp i32 %2 to float // CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0 // CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer -// CHECK: %dx.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat) -// CHECK: ret <3 x float> %dx.fmad +// CHECK: %hlsl.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat) +// CHECK: ret <3 x float> %hlsl.fmad float3 test_mad_float3_int_splat(float3 p0, float3 p1, int p2) { return mad(p0, p1, p2); } diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 5b6dfe308a76ea..7731300ae9f525 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -790,6 +790,7 @@ // RUN: %clang_cl -vctoolsdir "" /arm64EC /c -### -- %s 2>&1 | FileCheck --check-prefix=ARM64EC %s // ARM64EC-NOT: /arm64EC has been overridden by specified target // ARM64EC: "-triple" "arm64ec-pc-windows-msvc19.33.0" +// ARM64EC-SAME: "--dependent-lib=softintrin" // RUN: %clang_cl -vctoolsdir "" /arm64EC /c -target x86_64-pc-windows-msvc -### -- %s 2>&1 | FileCheck --check-prefix=ARM64EC_OVERRIDE %s // 
ARM64EC_OVERRIDE: warning: /arm64EC has been overridden by specified target: x86_64-pc-windows-msvc; option ignored diff --git a/clang/test/Driver/clang-offload-bundler-zstd.c b/clang/test/Driver/clang-offload-bundler-zstd.c index 4485e57309bbbc..a424981c69716f 100644 --- a/clang/test/Driver/clang-offload-bundler-zstd.c +++ b/clang/test/Driver/clang-offload-bundler-zstd.c @@ -22,19 +22,22 @@ // Check compression/decompression of offload bundle. // // RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ -// RUN: -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc -compress -verbose 2>&1 | \ -// RUN: FileCheck -check-prefix=COMPRESS %s +// RUN: -input=%t.tgt1 -input=%t.tgt2 -output=%t.hip.bundle.bc -compress -verbose >%t.1.txt 2>&1 // RUN: clang-offload-bundler -type=bc -list -input=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s // RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ -// RUN: -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle -verbose 2>&1 | \ -// RUN: FileCheck -check-prefix=DECOMPRESS %s +// RUN: -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.hip.bundle.bc -unbundle -verbose >%t.2.txt 2>&1 +// RUN: cat %t.1.txt %t.2.txt | FileCheck %s // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // -// COMPRESS: Compression method used: zstd -// COMPRESS: Compression level: 3 -// DECOMPRESS: Decompression method: zstd -// DECOMPRESS: Hashes match: Yes +// CHECK: Compressed bundle format version: 2 +// CHECK: Total file size (including headers): [[SIZE:[0-9]*]] bytes +// CHECK: Compression method used: zstd +// CHECK: Compression level: 3 +// CHECK: Compressed bundle format version: 2 +// CHECK: Total file size (from header): [[SIZE]] bytes +// CHECK: Decompression method: zstd +// CHECK: Hashes match: Yes // NOHOST-NOT: host- // NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx900 // NOHOST-DAG: hip-amdgcn-amd-amdhsa--gfx906 diff --git a/clang/test/Driver/debug-options-embed-source.c b/clang/test/Driver/debug-options-embed-source.c new file mode 100644 index 00000000000000..acb00fca62ee03 --- /dev/null +++ b/clang/test/Driver/debug-options-embed-source.c @@ -0,0 +1,13 @@ +// AIX does not support -gdwarf-5 which is required by -gembed-source +// UNSUPPORTED: target={{.*}}-aix{{.*}} + +// RUN: %clang -### -gdwarf-5 -gembed-source %s 2>&1 | FileCheck -check-prefix=GEMBED_5 %s +// RUN: not %clang -### -gdwarf-2 -gembed-source %s 2>&1 | FileCheck -check-prefix=GEMBED_2 %s +// RUN: %clang -### -gdwarf-5 -gno-embed-source %s 2>&1 | FileCheck -check-prefix=NOGEMBED_5 %s +// RUN: %clang -### -gdwarf-2 -gno-embed-source %s 2>&1 | FileCheck -check-prefix=NOGEMBED_2 %s +// +// GEMBED_5: "-gembed-source" +// GEMBED_2: error: invalid argument '-gembed-source' only allowed with '-gdwarf-5' +// NOGEMBED_5-NOT: "-gembed-source" +// NOGEMBED_2-NOT: error: invalid argument '-gembed-source' only allowed with '-gdwarf-5' +// diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c index a6acfe88a38611..b209c911d1ca2b 100644 --- a/clang/test/Driver/debug-options.c +++ b/clang/test/Driver/debug-options.c @@ -410,16 +410,6 @@ // MACRO: "-debug-info-macro" // NOMACRO-NOT: "-debug-info-macro" // -// RUN: %clang -### -gdwarf-5 -gembed-source %s 2>&1 | FileCheck -check-prefix=GEMBED_5 %s -// RUN: not %clang -### -gdwarf-2 -gembed-source %s 2>&1 | FileCheck -check-prefix=GEMBED_2 %s -// RUN: %clang -### -gdwarf-5 -gno-embed-source %s 2>&1 | 
FileCheck -check-prefix=NOGEMBED_5 %s -// RUN: %clang -### -gdwarf-2 -gno-embed-source %s 2>&1 | FileCheck -check-prefix=NOGEMBED_2 %s -// -// GEMBED_5: "-gembed-source" -// GEMBED_2: error: invalid argument '-gembed-source' only allowed with '-gdwarf-5' -// NOGEMBED_5-NOT: "-gembed-source" -// NOGEMBED_2-NOT: error: invalid argument '-gembed-source' only allowed with '-gdwarf-5' -// // RUN: %clang -### -g -fno-eliminate-unused-debug-types -c %s 2>&1 \ // RUN: | FileCheck -check-prefix=DEBUG_UNUSED_TYPES %s // DEBUG_UNUSED_TYPES: "-debug-info-kind=unused-types" diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index 5e1db5ba1ed3e9..cfe293cd4667ff 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -37,6 +37,8 @@ // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS +// RUN: touch %t.o +// RUN: %clang --target=riscv32-unknown-elf -### %t.o -mno-strict-align -mstrict-align // FAST-UNALIGNED-ACCESS: "-target-feature" "+unaligned-scalar-mem" "-target-feature" "+unaligned-vector-mem" // NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-unaligned-scalar-mem" "-target-feature" "-unaligned-vector-mem" diff --git a/clang/test/Modules/add-remove-irrelevant-module-map.m b/clang/test/Modules/add-remove-irrelevant-module-map.m deleted file mode 100644 index 7e3e58037e6f21..00000000000000 --- a/clang/test/Modules/add-remove-irrelevant-module-map.m +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: rm -rf %t && mkdir %t -// RUN: split-file %s %t - -//--- a/module.modulemap -module a {} - -//--- b/module.modulemap -module b {} - -//--- c/module.modulemap -module c {} - -//--- module.modulemap -module m { header "m.h" } -//--- m.h -@import c; - -//--- test-simple.m -// expected-no-diagnostics -@import m; - -// Build modules with the non-affecting "a/module.modulemap". -// RUN: %clang_cc1 -I %t/a -I %t/b -I %t/c -I %t -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -fdisable-module-hash %t/test-simple.m -verify -// RUN: mv %t/cache %t/cache-with - -// Build modules without the non-affecting "a/module.modulemap". -// RUN: rm -rf %t/a/module.modulemap -// RUN: %clang_cc1 -I %t/a -I %t/b -I %t/c -I %t -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -fdisable-module-hash %t/test-simple.m -verify -// RUN: mv %t/cache %t/cache-without - -// Check that the PCM files are bit-for-bit identical. -// RUN: diff %t/cache-with/m.pcm %t/cache-without/m.pcm diff --git a/clang/test/Modules/prune-non-affecting-module-map-files.m b/clang/test/Modules/prune-non-affecting-module-map-files.m new file mode 100644 index 00000000000000..ba2b3a306eaf46 --- /dev/null +++ b/clang/test/Modules/prune-non-affecting-module-map-files.m @@ -0,0 +1,62 @@ +// Check that the presence of non-affecting module map files does not affect the +// contents of PCM files. 
+ +// RUN: rm -rf %t && mkdir %t +// RUN: split-file %s %t + +//--- a/module.modulemap +module a {} + +//--- b/module.modulemap +module b {} + +//--- c/module.modulemap +module c { header "c.h" } +//--- c/c.h +@import b; + +//--- tu.m +@import c; + +//--- explicit-mms-common-args.rsp +-fmodule-map-file=b/module.modulemap -fmodule-map-file=c/module.modulemap -fmodules -fmodules-cache-path=cache -fdisable-module-hash -fsyntax-only tu.m +//--- implicit-search-args.rsp +-I a -I b -I c -fimplicit-module-maps -fmodules -fmodules-cache-path=cache -fdisable-module-hash -fsyntax-only tu.m +//--- implicit-search-args.rsp-end + +// Test with explicit module map files. +// +// RUN: %clang_cc1 -working-directory %t @%t/explicit-mms-common-args.rsp +// RUN: mv %t/cache %t/cache-explicit-no-a-prune +// RUN: %clang_cc1 -working-directory %t @%t/explicit-mms-common-args.rsp -fno-modules-prune-non-affecting-module-map-files +// RUN: mv %t/cache %t/cache-explicit-no-a-keep +// +// RUN: %clang_cc1 -working-directory %t -fmodule-map-file=a/module.modulemap @%t/explicit-mms-common-args.rsp +// RUN: mv %t/cache %t/cache-explicit-a-prune +// RUN: %clang_cc1 -working-directory %t -fmodule-map-file=a/module.modulemap @%t/explicit-mms-common-args.rsp -fno-modules-prune-non-affecting-module-map-files +// RUN: mv %t/cache %t/cache-explicit-a-keep +// +// RUN: diff %t/cache-explicit-no-a-prune/c.pcm %t/cache-explicit-a-prune/c.pcm +// RUN: not diff %t/cache-explicit-no-a-keep/c.pcm %t/cache-explicit-a-keep/c.pcm + +// Test with implicit module map search. +// +// RUN: %clang_cc1 -working-directory %t @%t/implicit-search-args.rsp +// RUN: mv %t/cache %t/cache-implicit-no-a-prune +// RUN: %clang_cc1 -working-directory %t @%t/implicit-search-args.rsp -fno-modules-prune-non-affecting-module-map-files +// RUN: mv %t/cache %t/cache-implicit-no-a-keep +// +// FIXME: Instead of removing "a/module.modulemap" from the file system, we +// could drop the "-I a" search path argument in combination with the +// "-fmodules-skip-header-search-paths" flag. Unfortunately, that flag +// does not prevent serialization of the search path usage bit vector, +// making the files differ anyways. 
+// RUN: rm %t/a/module.modulemap +// +// RUN: %clang_cc1 -working-directory %t @%t/implicit-search-args.rsp +// RUN: mv %t/cache %t/cache-implicit-a-prune +// RUN: %clang_cc1 -working-directory %t @%t/implicit-search-args.rsp -fno-modules-prune-non-affecting-module-map-files +// RUN: mv %t/cache %t/cache-implicit-a-keep +// +// RUN: diff %t/cache-implicit-no-a-prune/c.pcm %t/cache-implicit-a-prune/c.pcm +// RUN: not diff %t/cache-implicit-no-a-keep/c.pcm %t/cache-implicit-a-keep/c.pcm diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 55c97c73e8b695..6a1feeb9bf5397 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ -// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s +// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -Waarch64-sme-attributes -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target @@ -33,6 +33,7 @@ svuint32_t incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) __arm_streaming return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); } +// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svuint32_t incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); @@ -48,6 +49,7 @@ svuint32_t incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) __arm_streamin return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); } +// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svuint32_t incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); @@ -68,6 +70,7 @@ svfloat64_t streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_ return svadd_n_f64_m(pg, a, b); } +// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svfloat64_t locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { // expected-no-warning return svadd_n_f64_m(pg, a, b); @@ -83,6 +86,7 @@ svint16_t streaming_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming { return svmul_lane_s16(op1, op2, 0); } +// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. 
The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svint16_t locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { // expected-no-warning return svmul_lane_s16(op1, op2, 0); diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index bfc8768c3f36e1..12de16509ccb8d 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify=expected-cpp -x c++ %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -Waarch64-sme-attributes -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s // Valid attributes @@ -496,3 +496,135 @@ void fmv_caller() { just_fine(); incompatible_locally_streaming(); } + +void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { } + +__SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; } + +void sme_streaming_compatible_with_vl_arg(__SVInt8_t a) __arm_streaming_compatible { } + +__SVInt8_t sme_streaming_compatible_returns_vl(void) __arm_streaming_compatible { __SVInt8_t r; return r; } + +void sme_no_streaming_with_vl_arg(__SVInt8_t a) { } + +__SVInt8_t sme_no_streaming_returns_vl(void) { __SVInt8_t r; return r; } + +// expected-warning@+2 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} +// expected-cpp-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} +__arm_locally_streaming void sme_locally_streaming_with_vl_arg(__SVInt8_t a) { } + +// expected-warning@+2 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} +// expected-cpp-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} +__arm_locally_streaming __SVInt8_t sme_locally_streaming_returns_vl(void) { __SVInt8_t r; return r; } + +void sme_no_streaming_calling_streaming_with_vl_args() { + __SVInt8_t a; + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + sme_streaming_with_vl_arg(a); +} + +void sme_no_streaming_calling_streaming_with_return_vl() { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. 
The streaming and non-streaming vector lengths may be different}} + __SVInt8_t r = sme_streaming_returns_vl(); +} + +void sme_streaming_calling_non_streaming_with_vl_args(void) __arm_streaming { + __SVInt8_t a; + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + sme_no_streaming_with_vl_arg(a); +} + +void sme_streaming_calling_non_streaming_with_return_vl(void) __arm_streaming { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + __SVInt8_t r = sme_no_streaming_returns_vl(); +} + +void sme_no_streaming_calling_streaming_with_vl_args_param(__SVInt8_t arg, void (*sc)( __SVInt8_t arg) __arm_streaming) { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + sc(arg); +} + +__SVInt8_t sme_no_streaming_calling_streaming_return_vl_param(__SVInt8_t (*s)(void) __arm_streaming) { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + return s(); +} + +void sme_streaming_compatible_calling_streaming_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + sme_streaming_with_vl_arg(arg); +} + +void sme_streaming_compatible_calling_sme_streaming_return_vl(void) __arm_streaming_compatible { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + __SVInt8_t r = sme_streaming_returns_vl(); +} + +void sme_streaming_compatible_calling_no_streaming_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. 
The streaming and non-streaming vector lengths may be different}} + sme_no_streaming_with_vl_arg(arg); +} + +void sme_streaming_compatible_calling_no_sme_streaming_return_vl(void) __arm_streaming_compatible { + // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} + __SVInt8_t r = sme_no_streaming_returns_vl(); +} + +void sme_streaming_calling_streaming(__SVInt8_t arg, void (*s)( __SVInt8_t arg) __arm_streaming) __arm_streaming { + s(arg); +} + +__SVInt8_t sme_streaming_calling_streaming_return_vl(__SVInt8_t (*s)(void) __arm_streaming) __arm_streaming { + return s(); +} + +void sme_streaming_calling_streaming_with_vl_args(__SVInt8_t a) __arm_streaming { + sme_streaming_with_vl_arg(a); +} + +void sme_streaming_calling_streaming_with_return_vl(void) __arm_streaming { + __SVInt8_t r = sme_streaming_returns_vl(); +} + +void sme_streaming_calling_streaming_compatible_with_vl_args(__SVInt8_t a) __arm_streaming { + sme_streaming_compatible_with_vl_arg(a); +} + +void sme_streaming_calling_streaming_compatible_with_return_vl(void) __arm_streaming { + __SVInt8_t r = sme_streaming_compatible_returns_vl(); +} + +void sme_no_streaming_calling_streaming_compatible_with_vl_args() { + __SVInt8_t a; + sme_streaming_compatible_with_vl_arg(a); +} + +void sme_no_streaming_calling_streaming_compatible_with_return_vl() { + __SVInt8_t r = sme_streaming_compatible_returns_vl(); +} + +void sme_no_streaming_calling_non_streaming_compatible_with_vl_args() { + __SVInt8_t a; + sme_no_streaming_with_vl_arg(a); +} + +void sme_no_streaming_calling_non_streaming_compatible_with_return_vl() { + __SVInt8_t r = sme_no_streaming_returns_vl(); +} + +void sme_streaming_compatible_calling_streaming_compatible_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { + sme_streaming_compatible_with_vl_arg(arg); +} + +void sme_streaming_compatible_calling_streaming_compatible_with_return_vl(void) __arm_streaming_compatible { + __SVInt8_t r = sme_streaming_compatible_returns_vl(); +} diff --git a/clang/test/Sema/ppc-attr-target-inline.c b/clang/test/Sema/ppc-attr-target-inline.c index 07ed0068226584..ad198b842bb032 100644 --- a/clang/test/Sema/ppc-attr-target-inline.c +++ b/clang/test/Sema/ppc-attr-target-inline.c @@ -1,5 +1,5 @@ // REQUIRES: powerpc-registered-target -// RUN: %clang_cc1 -triple powerpc64le -target-feature +htm -fsyntax-only -emit-llvm %s -verify +// RUN: %clang_cc1 -triple powerpc64le -target-feature +htm -fsyntax-only -emit-llvm-only %s -verify __attribute__((always_inline)) int test1(int *x) { diff --git a/clang/test/SemaCXX/aarch64-sve-resolve-type.cpp b/clang/test/SemaCXX/aarch64-sve-resolve-type.cpp new file mode 100644 index 00000000000000..7a563848cdcf98 --- /dev/null +++ b/clang/test/SemaCXX/aarch64-sve-resolve-type.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fsyntax-only %s + +// REQUIRES: aarch64-registered-target || arm-registered-target + +// expected-no-diagnostics + +struct a {}; +__SVFloat32_t b(a); +template <class c> using e = decltype(b(c())); +e<a> f(a); +template <class c> using h = decltype(f(c())); +template <class g> struct i { + static void j() { + a d; + g()(d); + } +}; +struct k { + template <class c> void operator()(c) { + [](h<c>) {}; + } + void l() { i<k>::j; } +}; diff --git
a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp index e13635383b6ca6..606715e6aacffd 100644 --- a/clang/test/SemaCXX/cxx2c-pack-indexing.cpp +++ b/clang/test/SemaCXX/cxx2c-pack-indexing.cpp @@ -154,3 +154,9 @@ void f() { } } + +namespace GH88929 { + bool b = a...[0]; // expected-error {{use of undeclared identifier 'a'}} + using E = P...[0]; // expected-error {{unknown type name 'P'}} \ + // expected-error {{expected ';' after alias declaration}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl index 862b9465207312..7bb5308c5d5ba7 100644 --- a/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/any-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected bool test_too_few_arg() { return __builtin_hlsl_elementwise_any(); diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl index 4c0e5315ce532e..f669098ef515d8 100644 --- a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float2 test_no_second_arg(float2 p0) { return __builtin_hlsl_elementwise_clamp(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl index ba7ffc20484ae0..095f3c12ba8731 100644 --- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float test_no_second_arg(float2 p0) { return __builtin_hlsl_dot(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl index e2e79abb74a32d..321fc915ec01fc 100644 --- a/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/exp-errors.hlsl @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp +// RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected -DTEST_FUNC=__builtin_elementwise_exp2 float test_too_few_arg() { return TEST_FUNC(); // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl index f82b5942fd46b6..f3cfbcf29d69c2 100644 --- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float test_too_few_arg() { return __builtin_hlsl_elementwise_frac(); diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl index 98c02c38675f4e..ef0928f8fef0d6 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl @@ -1,15 +1,15 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10 -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc +// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_ceil +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_cos +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_exp2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_floor +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log2 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10 +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_roundeven +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc double2 test_double_builtin(double2 p0) { return TEST_FUNC(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl index 7ddfd56638273b..e2f03812705fd5 100644 --- a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected bool test_too_few_arg() { return __builtin_hlsl_elementwise_isinf(); diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl index 83751f68357edf..d23357239b7e8a 100644 --- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s 
-fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float2 test_no_second_arg(float2 p0) { return __builtin_hlsl_lerp(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl index 97ce931bf1b5b5..636910b7ac8abc 100644 --- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float2 test_no_second_arg(float2 p0) { return __builtin_hlsl_mad(p0); diff --git a/clang/test/SemaHLSL/BuiltIns/pow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/pow-errors.hlsl index 949028aacf24b6..5a2c9a3ef62ff5 100644 --- a/clang/test/SemaHLSL/BuiltIns/pow-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/pow-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify double2 test_double_builtin(double2 p0, double2 p1) { return __builtin_elementwise_pow(p0,p1); diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl index fa6fd813f19e64..6bc5b9bed3047c 100644 --- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float test_too_few_arg() { return __builtin_hlsl_elementwise_rcp(); diff --git a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl index 6e66db6d1cca9e..49cd895d98c52d 100644 --- a/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/reversebits-errors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify double2 test_int_builtin(double2 p0) { diff --git a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl index fed4573063acb5..bede89015298b6 100644 --- a/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/round-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float test_too_few_arg() { return __builtin_elementwise_round(); diff --git 
a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl index c027a698c5e58f..e9a295172c7f8e 100644 --- a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected float test_too_few_arg() { return __builtin_hlsl_elementwise_rsqrt(); diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index acc596fef87b76..4ee64de697d37a 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -6721,6 +6721,23 @@ TEST_P(ASTImporterOptionSpecificTestBase, LambdaInFunctionBody) { EXPECT_FALSE(FromL->isDependentLambda()); } +TEST_P(ASTImporterOptionSpecificTestBase, + ReturnTypeDeclaredInsideOfCXX11LambdaWithoutTrailingReturn) { + Decl *From, *To; + std::tie(From, To) = getImportedDecl( + R"( + void foo() { + (void) []() { + struct X {}; + return X(); + }; + } + )", + Lang_CXX11, "", Lang_CXX11, "foo"); // c++11 only + auto *ToLambda = FirstDeclMatcher<LambdaExpr>().match(To, lambdaExpr()); + EXPECT_TRUE(ToLambda); +} + TEST_P(ASTImporterOptionSpecificTestBase, LambdaInFunctionParam) { Decl *FromTU = getTuDecl( R"( diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 4f445c64ab303a..34999b7376397b 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -599,6 +599,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::r_paren, TT_CastRParen); + Tokens = annotate("(uint32_t)&&label;"); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::r_paren, TT_CastRParen); + EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_UnaryOperator); + EXPECT_TOKEN(Tokens[4], tok::identifier, TT_Unknown); + Tokens = annotate("auto x = (Foo)p;"); ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[5], tok::r_paren, TT_CastRParen); diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 88e7b6e8546595..5f6546a6498c3c 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -658,9 +658,9 @@ class IRBuilderResult : public Result { std::vector<Ptr> Args; std::set<unsigned> AddressArgs; std::map<unsigned, std::string> IntegerArgs; - IRBuilderResult(StringRef CallPrefix, std::vector<Ptr> Args, - std::set<unsigned> AddressArgs, - std::map<unsigned, std::string> IntegerArgs) + IRBuilderResult(StringRef CallPrefix, const std::vector<Ptr> &Args, + const std::set<unsigned> &AddressArgs, + const std::map<unsigned, std::string> &IntegerArgs) : CallPrefix(CallPrefix), Args(Args), AddressArgs(AddressArgs), IntegerArgs(IntegerArgs) {} void genCode(raw_ostream &OS, @@ -727,8 +727,8 @@ class IRIntrinsicResult : public Result { std::string IntrinsicID; std::vector<const Type *> ParamTypes; std::vector<Ptr> Args; - IRIntrinsicResult(StringRef IntrinsicID, std::vector<const Type *> ParamTypes, - std::vector<Ptr> Args) + IRIntrinsicResult(StringRef IntrinsicID, const std::vector<const Type *> &ParamTypes, + const std::vector<Ptr> &Args) : IntrinsicID(std::string(IntrinsicID)), ParamTypes(ParamTypes), Args(Args) {} void genCode(raw_ostream &OS, diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h
b/compiler-rt/lib/scudo/standalone/allocator_config.h index 1e0cf1015ba67e..60f59bdd2f4c3e 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -146,6 +146,7 @@ struct FuchsiaConfig { // Support 39-bit VMA for riscv-64 static const uptr RegionSizeLog = 28U; static const uptr GroupSizeLog = 19U; + static const bool EnableContiguousRegions = false; #else static const uptr RegionSizeLog = 30U; static const uptr GroupSizeLog = 21U; diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 71141e5efac488..c8e75024823f2c 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -81,12 +81,13 @@ if (FLANG_STANDALONE_BUILD) mark_as_advanced(LLVM_ENABLE_ASSERTIONS) endif() - # We need a pre-built/installed version of LLVM. - find_package(LLVM REQUIRED HINTS "${LLVM_CMAKE_DIR}") # If the user specifies a relative path to LLVM_DIR, the calls to include # LLVM modules fail. Append the absolute path to LLVM_DIR instead. - get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} REALPATH) + get_filename_component(LLVM_DIR_ABSOLUTE ${LLVM_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR_ABSOLUTE}) + # We need a pre-built/installed version of LLVM. + find_package(LLVM REQUIRED HINTS "${LLVM_DIR_ABSOLUTE}") # Users might specify a path to CLANG_DIR that's: # * a full path, or @@ -97,7 +98,7 @@ if (FLANG_STANDALONE_BUILD) CLANG_DIR_ABSOLUTE ${CLANG_DIR} REALPATH - ${CMAKE_CURRENT_SOURCE_DIR}) + BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) list(APPEND CMAKE_MODULE_PATH ${CLANG_DIR_ABSOLUTE}) # TODO: Remove when libclangDriver is lifted out of Clang @@ -124,13 +125,14 @@ if (FLANG_STANDALONE_BUILD) include(AddClang) include(TableGen) - find_package(MLIR REQUIRED CONFIG) - # Use SYSTEM for the same reasons as for LLVM includes - include_directories(SYSTEM ${MLIR_INCLUDE_DIRS}) # If the user specifies a relative path to MLIR_DIR, the calls to include # MLIR modules fail. Append the absolute path to MLIR_DIR instead. - get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} REALPATH) + get_filename_component(MLIR_DIR_ABSOLUTE ${MLIR_DIR} + REALPATH BASE_DIR ${CMAKE_CURRENT_BINARY_DIR}) list(APPEND CMAKE_MODULE_PATH ${MLIR_DIR_ABSOLUTE}) + find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR_ABSOLUTE}) + # Use SYSTEM for the same reasons as for LLVM includes + include_directories(SYSTEM ${MLIR_INCLUDE_DIRS}) include(AddMLIR) find_program(MLIR_TABLEGEN_EXE "mlir-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 814f3eb12cfcbc..ac120b4ff09b6d 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -266,26 +266,6 @@ is `ParseSyntaxOnlyAction`, which corresponds to `-fsyntax-only`. In other words, `flang-new -fc1 ` is equivalent to `flang-new -fc1 -fsyntax-only `. -## The `flang-to-external-fc` script -The `flang-to-external-fc` wrapper script for `flang-new` was introduced as a -development tool and to facilitate testing. The `flang-to-external-fc` wrapper -script will: -* use `flang-new` to unparse the input source file (i.e. it will run `flang-new - -fc1 -fdebug-unparse `), and then -* call a host Fortran compiler, e.g. `gfortran`, to compile the unparsed file. 
- -Here's a basic breakdown of what happens inside `flang-to-external-fc` when you -run `flang-to-external-fc file.f90`: -```bash -flang-new -fc1 -fdebug-unparse file.f90 -o file-unparsed.f90 -gfortran file-unparsed.f90 -``` -This is a simplified version for illustration purposes only. In practice, -`flang-to-external-fc` adds a few more frontend options and it also supports -various other use cases (e.g. compiling C files, linking existing object -files). `gfortran` is the default host compiler used by `flang-to-external-fc`. -You can change it by setting the `FLANG_FC` environment variable. - ## Adding new Compiler Options Adding a new compiler option in Flang consists of two steps: * define the new option in a dedicated TableGen file, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 5a42e6a6aa4175..8bb2f83282b556 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -136,7 +136,7 @@ void DataSharingProcessor::insertBarrier() { void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { bool cmpCreated = false; - mlir::OpBuilder::InsertPoint localInsPt = firOpBuilder.saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard guard(firOpBuilder); for (const omp::Clause &clause : clauses) { if (clause.id != llvm::omp::OMPC_lastprivate) continue; @@ -203,12 +203,11 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { // Lastprivate operation is inserted at the end // of the lexically last section in the sections // construct - mlir::OpBuilder::InsertPoint unstructuredSectionsIP = - firOpBuilder.saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard unstructuredSectionsGuard( + firOpBuilder); mlir::Operation *lastOper = op->getRegion(0).back().getTerminator(); firOpBuilder.setInsertionPoint(lastOper); lastPrivIP = firOpBuilder.saveInsertionPoint(); - firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP); } } } else if (mlir::isa(op)) { @@ -268,7 +267,6 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { "simd/worksharing-loop"); } } - firOpBuilder.restoreInsertionPoint(localInsPt); } void DataSharingProcessor::collectSymbols( @@ -372,7 +370,7 @@ void DataSharingProcessor::doPrivatize( uniquePrivatizerName)) return existingPrivatizer; - auto ip = firOpBuilder.saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard guard(firOpBuilder); firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(), moduleOp.getBodyRegion().front().begin()); auto result = firOpBuilder.create( @@ -424,7 +422,6 @@ void DataSharingProcessor::doPrivatize( } symTable->popScope(); - firOpBuilder.restoreInsertionPoint(ip); return result; }(); diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 4424788e0132e2..db99617a7ba9f8 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -134,7 +134,7 @@ static void threadPrivatizeVars(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::Location currentLocation = converter.getCurrentLocation(); - mlir::OpBuilder::InsertPoint insPt = firOpBuilder.saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard guard(firOpBuilder); firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); // If the symbol corresponds to the original ThreadprivateOp, use the symbol @@ -197,8 +197,6 @@ static void 
threadPrivatizeVars(Fortran::lower::AbstractConverter &converter, getExtendedValue(sexv, symThreadprivateValue); converter.bindSymbol(*sym, symThreadprivateExv); } - - firOpBuilder.restoreInsertionPoint(insPt); } static mlir::Operation * diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index f42386fe2736dd..9f8352a8025cee 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -220,12 +220,12 @@ mlir::Value ReductionProcessor::createScalarCombiner( switch (redId) { case ReductionIdentifier::MAX: reductionOp = - getReductionOperation<mlir::arith::MaximumFOp, mlir::arith::MaxSIOp>( + getReductionOperation<mlir::arith::MaxNumFOp, mlir::arith::MaxSIOp>( builder, type, loc, op1, op2); break; case ReductionIdentifier::MIN: reductionOp = - getReductionOperation<mlir::arith::MinimumFOp, mlir::arith::MinSIOp>( + getReductionOperation<mlir::arith::MinNumFOp, mlir::arith::MinSIOp>( builder, type, loc, op1, op2); break; case ReductionIdentifier::IOR: @@ -486,9 +486,8 @@ createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc, assert(cstNeedsDealloc.has_value() && "createTempFromMold decides this statically"); if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) { - auto insPt = builder.saveInsertionPoint(); + mlir::OpBuilder::InsertionGuard guard(builder); createReductionCleanupRegion(builder, loc, reductionDecl); - builder.restoreInsertionPoint(insPt); } // Put the temporary inside of a box: diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index d909bda89cdeb4..921eac2f8f4b60 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -2110,9 +2110,8 @@ struct XArrayCoorOpConversion const bool baseIsBoxed = coor.getMemref().getType().isa<fir::BaseBoxType>(); TypePair baseBoxTyPair = baseIsBoxed ? getBoxTypePair(coor.getMemref().getType()) : TypePair{}; - mlir::LLVM::IntegerOverflowFlagsAttr nsw = - mlir::LLVM::IntegerOverflowFlagsAttr::get( - rewriter.getContext(), mlir::LLVM::IntegerOverflowFlags::nsw); + mlir::LLVM::IntegerOverflowFlags nsw = + mlir::LLVM::IntegerOverflowFlags::nsw; // For each dimension of the array, generate the offset calculation.
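The recurring flang change above swaps manual `saveInsertionPoint()` / `restoreInsertionPoint()` pairs for `mlir::OpBuilder::InsertionGuard`, an RAII helper that restores the saved insertion point when it goes out of scope, so early `return`/`continue` paths can no longer leak a stale insertion point. A short sketch of the pattern (illustrative only; `emitAtModuleStart` is a made-up example, not code from this patch):

```cpp
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"

// Temporarily redirect the builder to the start of the module body; the
// guard's destructor restores the previous insertion point on scope exit,
// including on early returns.
void emitAtModuleStart(mlir::OpBuilder &builder, mlir::ModuleOp module) {
  mlir::OpBuilder::InsertionGuard guard(builder); // saves the current point
  builder.setInsertionPointToStart(module.getBody());
  // ... create ops here ...
} // insertion point restored here
```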
for (unsigned i = 0; i < rank; ++i, ++indexOffset, ++shapeOffset, @@ -2396,9 +2395,8 @@ struct CoordinateOpConversion auto cpnTy = fir::dyn_cast_ptrOrBoxEleTy(boxObjTy); mlir::Type llvmPtrTy = ::getLlvmPtrType(coor.getContext()); mlir::Type byteTy = ::getI8Type(coor.getContext()); - mlir::LLVM::IntegerOverflowFlagsAttr nsw = - mlir::LLVM::IntegerOverflowFlagsAttr::get( - rewriter.getContext(), mlir::LLVM::IntegerOverflowFlags::nsw); + mlir::LLVM::IntegerOverflowFlags nsw = + mlir::LLVM::IntegerOverflowFlags::nsw; for (unsigned i = 1, last = operands.size(); i < last; ++i) { if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) { diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index a7244e1c58330a..b4c5660670579d 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -611,6 +611,20 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { return false; } } + if (allocateInfo_.gotPinned) { + std::optional<common::CUDADataAttr> cudaAttr{GetCUDADataAttr(ultimate_)}; + if (!cudaAttr || *cudaAttr != common::CUDADataAttr::Pinned) { + context.Say(name_.source, + "Object in ALLOCATE must have PINNED attribute when PINNED option is specified"_err_en_US); + } + } + if (allocateInfo_.gotStream) { + std::optional<common::CUDADataAttr> cudaAttr{GetCUDADataAttr(ultimate_)}; + if (!cudaAttr || *cudaAttr != common::CUDADataAttr::Device) { + context.Say(name_.source, + "Object in ALLOCATE must have DEVICE attribute when STREAM option is specified"_err_en_US); + } + } return RunCoarrayRelatedChecks(context); } diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp index fb1ebadd378586..a9e57de7e2f2b5 100644 --- a/flang/lib/Semantics/check-cuda.cpp +++ b/flang/lib/Semantics/check-cuda.cpp @@ -344,6 +344,9 @@ template <bool IsCUFKernelDo> class DeviceContextChecker { [&](const common::Indirection &x) { WarnOnIoStmt(source); }, + [&](const common::Indirection<parser::IfStmt> &x) { + Check(x.value()); + }, [&](const auto &x) { if (auto msg{ActionStmtChecker::WhyNotOk(x)}) { context_.Say(source, std::move(*msg)); @@ -369,6 +372,13 @@ template <bool IsCUFKernelDo> class DeviceContextChecker { Check(std::get(eb->t)); } } + void Check(const parser::IfStmt &is) { + const auto &uS{ + std::get<parser::UnlabeledStatement<parser::ActionStmt>>(is.t)}; + CheckUnwrappedExpr( + context_, uS.source, std::get<parser::ScalarLogicalExpr>(is.t)); + Check(uS.statement, uS.source); + } void Check(const parser::LoopControl::Bounds &bounds) { Check(bounds.lower); Check(bounds.upper); diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 875929e90fdd31..6fcee96dd69059 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -948,17 +948,12 @@ void CheckHelper::CheckObjectEntity( "Component '%s' with ATTRIBUTES(DEVICE) must also be allocatable"_err_en_US, symbol.name()); } - if (IsAssumedSizeArray(symbol)) { - messages_.Say( - "Object '%s' with ATTRIBUTES(DEVICE) may not be assumed size"_err_en_US, - symbol.name()); - } break; case common::CUDADataAttr::Managed: if (!IsAutomatic(symbol) && !IsAllocatable(symbol) && - !details.isDummy()) { + !details.isDummy() && !evaluate::IsExplicitShape(symbol)) { messages_.Say( - "Object '%s' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, or a dummy argument"_err_en_US, + "Object '%s' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, explicit shape, or a dummy argument"_err_en_US, symbol.name()); } break; diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index
bafa242a79302a..56653aa74f0cc5 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2286,6 +2286,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Reduction &x) { CheckReductionTypeList(x); } } + bool OmpStructureChecker::CheckReductionOperators( const parser::OmpClause::Reduction &x) { @@ -2356,6 +2357,16 @@ void OmpStructureChecker::CheckReductionTypeList( if (llvm::omp::nestedReduceWorkshareAllowedSet.test(GetContext().directive)) { CheckSharedBindingInOuterContext(ompObjectList); } + + SymbolSourceMap symbols; + GetSymbolsInObjectList(ompObjectList, symbols); + for (auto &[symbol, source] : symbols) { + if (IsProcedurePointer(*symbol)) { + context_.Say(source, + "A procedure pointer '%s' must not appear in a REDUCTION clause."_err_en_US, + symbol->name()); + } + } } void OmpStructureChecker::CheckIntentInPointerAndDefinable( diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf index 5b10334ecdbc14..251ff16a56c797 100644 --- a/flang/test/Lower/CUDA/cuda-allocatable.cuf +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -52,19 +52,19 @@ end subroutine ! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 subroutine sub4() - real, allocatable, unified :: a(:) + real, allocatable, device :: a(:) integer :: istream allocate(a(10), stream=istream) end subroutine ! CHECK-LABEL: func.func @_QPsub4() ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub4Ea"} -! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"} ! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: fir.call @_FortranAAllocatableSetBounds ! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref -! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda} -> i32 +! 
CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda} -> i32 subroutine sub5() real, allocatable, device :: a(:) diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 index f0979ab95f568a..80b720e3aac1d1 100644 --- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 @@ -11,7 +11,7 @@ !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref !CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref -!CHECK: %[[RES:.*]] = arith.maximumf %[[LD0]], %[[LD1]] {{.*}}: f32 +!CHECK: %[[RES:.*]] = arith.maxnumf %[[LD0]], %[[LD1]] {{.*}}: f32 !CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref !CHECK: omp.yield(%[[ARG0]] : !fir.ref) diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 index 996296c2adc2b5..c3b821ea591246 100644 --- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 @@ -6,7 +6,7 @@ !CHECK: omp.yield(%[[MINIMUM_VAL_F]] : f32) !CHECK: combiner !CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32): -!CHECK: %[[COMB_VAL_F:.*]] = arith.maximumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32 +!CHECK: %[[COMB_VAL_F:.*]] = arith.maxnumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32 !CHECK: omp.yield(%[[COMB_VAL_F]] : f32) !CHECK: omp.declare_reduction @[[MAX_DECLARE_I:.*]] : i32 init { diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 index 24aa8e46e5bbbd..b284f8e5d96721 100644 --- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 @@ -11,7 +11,7 @@ !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref !CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref -!CHECK: %[[RES:.*]] = arith.minimumf %[[LD0]], %[[LD1]] {{.*}}: f32 +!CHECK: %[[RES:.*]] = arith.minnumf %[[LD0]], %[[LD1]] {{.*}}: f32 !CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref !CHECK: omp.yield(%[[ARG0]] : !fir.ref) diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 index 268f51c9dc9330..ab33e180ed883d 100644 --- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 @@ -6,7 +6,7 @@ !CHECK: omp.yield(%[[MAXIMUM_VAL_F]] : f32) !CHECK: combiner !CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32): -!CHECK: %[[COMB_VAL_F:.*]] = arith.minimumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32 +!CHECK: %[[COMB_VAL_F:.*]] = arith.minnumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32 !CHECK: omp.yield(%[[COMB_VAL_F]] : f32) !CHECK: omp.declare_reduction @[[MIN_DECLARE_I:.*]] : i32 init { diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 index ee562bbe15863e..2f6921edcb42a5 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 @@ -13,7 +13,7 @@ !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref !CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref -!CHECK: %[[RES:.*]] = arith.maximumf %[[LD0]], %[[LD1]] {{.*}}: f32 +!CHECK: %[[RES:.*]] = 
arith.maxnumf %[[LD0]], %[[LD1]] {{.*}}: f32 !CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref !CHECK: omp.yield(%[[ARG0]] : !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 index 6f11f0ec96a7d3..c9cf5cbf4f8c02 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max.f90 @@ -10,7 +10,7 @@ ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32): -! CHECK: %[[VAL_2:.*]] = arith.maximumf %[[VAL_0]], %[[VAL_1]] fastmath : f32 +! CHECK: %[[VAL_2:.*]] = arith.maxnumf %[[VAL_0]], %[[VAL_1]] fastmath : f32 ! CHECK: omp.yield(%[[VAL_2]] : f32) ! CHECK: } diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 index c0372117a03b9d..84a376b46b8fbe 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 @@ -13,7 +13,7 @@ !CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref): !CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref !CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref -!CHECK: %[[RES:.*]] = arith.minimumf %[[LD0]], %[[LD1]] {{.*}}: f32 +!CHECK: %[[RES:.*]] = arith.minnumf %[[LD0]], %[[LD1]] {{.*}}: f32 !CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref !CHECK: omp.yield(%[[ARG0]] : !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 index 2c694f82e279a4..3ba279acd14c41 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min.f90 @@ -10,7 +10,7 @@ ! CHECK-LABEL: } combiner { ! CHECK: ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32): -! CHECK: %[[VAL_2:.*]] = arith.minimumf %[[VAL_0]], %[[VAL_1]] fastmath : f32 +! CHECK: %[[VAL_2:.*]] = arith.minnumf %[[VAL_0]], %[[VAL_1]] fastmath : f32 ! CHECK: omp.yield(%[[VAL_2]] : f32) ! 
CHECK: } diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common index 7f4217fb58355d..b097a6aa300458 100644 --- a/flang/test/Parser/cuf-sanity-common +++ b/flang/test/Parser/cuf-sanity-common @@ -32,6 +32,6 @@ module m call globalsub<<<1, 2>>> call globalsub<<<1, 2, 3>>> call globalsub<<<1, 2, 3, 4>>> - allocate(pa(32), stream = 1, pinned = isPinned) + allocate(pa(32), pinned = isPinned) end subroutine end module diff --git a/flang/test/Parser/cuf-sanity-tree.CUF b/flang/test/Parser/cuf-sanity-tree.CUF index dc12759d3ce52f..2820441d5b5f0a 100644 --- a/flang/test/Parser/cuf-sanity-tree.CUF +++ b/flang/test/Parser/cuf-sanity-tree.CUF @@ -199,8 +199,6 @@ include "cuf-sanity-common" !CHECK: | | | | | | AllocateShapeSpec !CHECK: | | | | | | | Scalar -> Integer -> Expr = '32_4' !CHECK: | | | | | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK: | | | | | AllocOpt -> Stream -> Scalar -> Integer -> Expr = '1_4' -!CHECK: | | | | | | LiteralConstant -> IntLiteralConstant = '1' !CHECK: | | | | | AllocOpt -> Pinned -> Scalar -> Logical -> Variable = 'ispinned' !CHECK: | | | | | | Designator -> DataRef -> Name = 'ispinned' !CHECK: | | | EndSubroutineStmt -> diff --git a/flang/test/Parser/cuf-sanity-unparse.CUF b/flang/test/Parser/cuf-sanity-unparse.CUF index 7ac39448d7bd45..b6921e74fc05ae 100644 --- a/flang/test/Parser/cuf-sanity-unparse.CUF +++ b/flang/test/Parser/cuf-sanity-unparse.CUF @@ -37,6 +37,6 @@ include "cuf-sanity-common" !CHECK: CALL globalsub<<<1_4,2_4>>>() !CHECK: CALL globalsub<<<1_4,2_4,3_4>>>() !CHECK: CALL globalsub<<<1_4,2_4,3_4,4_4>>>() -!CHECK: ALLOCATE(pa(32_4), STREAM=1_4, PINNED=ispinned) +!CHECK: ALLOCATE(pa(32_4), PINNED=ispinned) !CHECK: END SUBROUTINE !CHECK: END MODULE diff --git a/flang/test/Semantics/OpenMP/reduction12.f90 b/flang/test/Semantics/OpenMP/reduction12.f90 new file mode 100644 index 00000000000000..f896ca4aa60b67 --- /dev/null +++ b/flang/test/Semantics/OpenMP/reduction12.f90 @@ -0,0 +1,16 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp + +! OpenMP 5.2: Section 5.5.5 : A procedure pointer must not appear in a +! reduction clause. + + procedure(foo), pointer :: ptr + integer :: i + ptr => foo +!ERROR: A procedure pointer 'ptr' must not appear in a REDUCTION clause. +!$omp do reduction (+ : ptr) + do i = 1, 10 + end do +contains + subroutine foo + end subroutine +end diff --git a/flang/test/Semantics/cuf03.cuf b/flang/test/Semantics/cuf03.cuf index 8decb8dcaa0f47..472d53db7462ae 100644 --- a/flang/test/Semantics/cuf03.cuf +++ b/flang/test/Semantics/cuf03.cuf @@ -32,14 +32,11 @@ module m real, shared, target :: mst !ERROR: Object 'msa' with ATTRIBUTES(SHARED) must be declared in a device subprogram real, shared :: msa(*) - !ERROR: Object 'mm' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, or a dummy argument - real, managed :: mm - !ERROR: Object 'mmi' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, or a dummy argument - real, managed :: mmi = 1. + real, managed :: mm ! ok + real, managed :: mmi = 1. ! ok real, managed, allocatable :: mml ! ok - !ERROR: Object 'mmp' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, or a dummy argument - real, managed, pointer :: mmp ! 
ok - !ERROR: Object 'mmt' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, or a dummy argument + !ERROR: Object 'mmp' with ATTRIBUTES(MANAGED) must also be allocatable, automatic, explicit shape, or a dummy argument + real, managed, pointer :: mmp(:) real, managed, target :: mmt !WARNING: Object 'mp' with ATTRIBUTES(PINNED) should also be allocatable real, pinned :: mp @@ -60,8 +57,7 @@ module m contains attributes(device) subroutine devsubr(n,da) integer, intent(in) :: n - !ERROR: Object 'da' with ATTRIBUTES(DEVICE) may not be assumed size - real, device :: da(*) + real, device :: da(*) ! ok real, managed :: ma(n) ! ok !WARNING: Pointer 'dp' may not be associated in a device subprogram real, device, pointer :: dp diff --git a/flang/test/Semantics/cuf07.cuf b/flang/test/Semantics/cuf07.cuf index b520b5da51264b..c48abb5adf0d41 100644 --- a/flang/test/Semantics/cuf07.cuf +++ b/flang/test/Semantics/cuf07.cuf @@ -23,4 +23,20 @@ module m !BECAUSE: 'ma' is a host-associated allocatable and is not definable in a device subprogram deallocate(ma) end subroutine + + subroutine hostsub() + integer, allocatable, device :: ia(:) + logical :: plog + + !ERROR: Object in ALLOCATE must have PINNED attribute when PINNED option is specified + allocate(ia(100), pinned = plog) + end subroutine + + subroutine host2() + integer, allocatable, pinned :: ia(:) + integer :: istream + + !ERROR: Object in ALLOCATE must have DEVICE attribute when STREAM option is specified + allocate(ia(100), stream = istream) + end subroutine end module diff --git a/flang/test/Semantics/cuf11.cuf b/flang/test/Semantics/cuf11.cuf index de7ff29743242b..554ac258e55101 100644 --- a/flang/test/Semantics/cuf11.cuf +++ b/flang/test/Semantics/cuf11.cuf @@ -30,3 +30,7 @@ logical function compare_h(a,b) !ERROR: 'b' is not an object of derived type; it is implicitly typed compare_h = (a%h .eq. b%h) end + +attributes(global) subroutine sub2() + if (threadIdx%x == 1) print *, "I'm number one" +end subroutine diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index dda3b6887be89a..64815a1f5da622 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -93,17 +93,6 @@ endif() add_custom_target(module_files ALL DEPENDS ${MODULE_FILES}) -# This flang shell script will only work in a POSIX shell. -if (NOT WIN32) - configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/flang-to-external-fc.in - ${CMAKE_BINARY_DIR}/bin/flang-to-external-fc - @ONLY - ) - add_custom_target(flang-to-external-fc ALL DEPENDS ${CMAKE_BINARY_DIR}/bin/flang-to-external-fc) - install(PROGRAMS ${CMAKE_BINARY_DIR}/bin/flang-to-external-fc DESTINATION "${CMAKE_INSTALL_BINDIR}") -endif() - # TODO Move this to a more suitable location # Copy the generated omp_lib.h header file, if OpenMP support has been configured. if (LLVM_TOOL_OPENMP_BUILD) diff --git a/flang/tools/f18/flang-to-external-fc.in b/flang/tools/f18/flang-to-external-fc.in deleted file mode 100755 index bd16c030121cbb..00000000000000 --- a/flang/tools/f18/flang-to-external-fc.in +++ /dev/null @@ -1,497 +0,0 @@ -#! /usr/bin/env bash -#===-- tools/f18/flang-to-external-fc.sh --------------------------*- sh -*-===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
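The cuf07.cuf additions above check that ALLOCATE's PINNED= option requires an object with the PINNED attribute and that STREAM= requires the DEVICE attribute. The rationale maps onto the CUDA runtime: pinned allocations are page-locked host memory, while stream-ordered allocations are device memory tied to a stream. A rough C++/CUDA-runtime analogy (illustrative only, and not what flang generates; cudaMallocAsync assumes CUDA 11.2+):

#include <cuda_runtime.h>

int main() {
  // PINNED: page-locked host memory, allocated through the host allocator.
  float* host_buf = nullptr;
  cudaMallocHost(reinterpret_cast<void**>(&host_buf), 100 * sizeof(float));

  // STREAM: stream-ordered allocation of *device* memory; a host-side or
  // pinned object has no stream association, hence the semantic check.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  float* dev_buf = nullptr;
  cudaMallocAsync(reinterpret_cast<void**>(&dev_buf), 100 * sizeof(float), stream);

  cudaFreeAsync(dev_buf, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFreeHost(host_buf);
  return 0;
}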
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===------------------------------------------------------------------------===# -# A wrapper script for Flang's compiler driver that was developed for testing and -# experimenting. You should be able to use it as a regular compiler driver. It -# will: -# * run Flang's compiler driver to unparse the input source files -# * use the external compiler (defined via FLANG_FC environment variable) to -# compile the unparsed source files -# -# Tested with Bash 4.4. This script will exit immediately if you use an -# older version of Bash. -#===------------------------------------------------------------------------===# -set -euo pipefail - -# Global variables to make the parsing of input arguments a bit easier -INPUT_FILES=() -OPTIONS=() -OUTPUT_FILE="" -MODULE_DIR="" -INTRINSICS_MOD_DIR="" -COMPILE_ONLY="False" -PREPROCESS_ONLY="False" -PRINT_VERSION="False" - -# === check_bash_version ====================================================== -# -# Checks the Bash version that's used to run this script. Exits immediately -# with a non-zero return code if it's lower than 4.4. Otherwise returns 0 -# (success). -# ============================================================================= -check_bash_version() { - message="Error: Your Bash is too old. Please use Bash >= 4.4" - # Major version - [[ "${BASH_VERSINFO[0]:-0}" -lt 4 ]] && echo $message && exit 1 - - # Minor version - if [[ "${BASH_VERSINFO[0]}" == 4 ]]; then - [[ "${BASH_VERSINFO[1]:-0}" -lt 4 ]] && echo $message && exit 1 - fi - - return 0 -} - -# === parse_args ============================================================== -# -# Parse the input arguments passed to this script. Sets the global variables -# declared at the top. -# -# INPUTS: -# $1 - all input arguments -# OUTPUTS: -# Saved in the global variables for this script -# ============================================================================= -parse_args() -{ - while [ "${1:-}" != "" ]; do - # CASE 1: Compiler option - if [[ "${1:0:1}" == "-" ]] ; then - # Output file - extract it into a global variable - if [[ "$1" == "-o" ]] ; then - shift - OUTPUT_FILE="$1" - shift - continue - fi - - # Module directory - extract it into a global variable - if [[ "$1" == "-module-dir" ]]; then - shift - MODULE_DIR="$1" - shift - continue - fi - - # Intrinsics module dir - extract it into a global var - if [[ "$1" == "-intrinsics-module-directory" ]]; then shift - INTRINSICS_MOD_DIR=$1 - shift - continue - fi - - # Module suffix cannot be modified - this script defines it before - # calling the driver. - if [[ "$1" == "-module-suffix" ]]; then - echo "ERROR: \'-module-suffix\' is not available when using the \'flang\' script" - exit 1 - fi - - # Special treatment for `J ` and `-I `. We translate these - # into `J` and `-I` respectively. - if [[ "$1" == "-J" ]] || [[ "$1" == "-I" ]]; then - opt=$1 - shift - OPTIONS+=("$opt$1") - shift - continue - fi - - # This is a regular option - just add it to the list. - OPTIONS+=($1) - if [[ $1 == "-c" ]]; then - COMPILE_ONLY="True" - fi - - if [[ $1 == "-S" ]]; then - COMPILE_ONLY="True" - fi - - if [[ $1 == "-E" ]]; then - PREPROCESS_ONLY="True" - fi - - if [[ $1 == "-v" || $1 == "--version" ]]; then - PRINT_VERSION="True" - fi - - shift - continue - - # CASE 2: A regular file (either source or a library file) - elif [[ -f "$1" ]]; then - INPUT_FILES+=($1) - shift - continue - - else - # CASE 3: Unsupported - echo "ERROR: unrecognised option format: \`$1\`. Perhaps non-existent file?" 
- exit 1 - fi - done -} - -# === categorise_files ======================================================== -# -# Categorises input files into: -# * Fortran source files (to be compiled) -# * library files (to be linked into the final executable) -# -# INPUTS: -# $1 - all input files to be categorised (array, name reference) -# OUTPUTS: -# $2 - Fortran source files extracted from $1 (array, name reference) -# $3 - other source files extracted from $1 (array, name reference) -# $4 - object files extracted from $1 (array, name reference) -# $5 - lib files extracted from $1 (array, name reference) -# ============================================================================= -categorise_files() -{ - local -n -r all_files=$1 - local -n fortran_sources=$2 - local -n other_sources=$3 - local -n objects=$4 - local -n libs=$5 - - for current_file in "${all_files[@]}"; do - file_ext=${current_file##*.} - if [[ $file_ext == "f" ]] || [[ $file_ext == "f90" ]] || - [[ $file_ext == "f" ]] || [[ $file_ext == "F" ]] || [[ $file_ext == "ff" ]] || - [[ $file_ext == "f90" ]] || [[ $file_ext == "F90" ]] || [[ $file_ext == "ff90" ]] || - [[ $file_ext == "f95" ]] || [[ $file_ext == "F95" ]] || [[ $file_ext == "ff95" ]] || - [[ $file_ext == "cuf" ]] || [[ $file_ext == "CUF" ]] || [[ $file_ext == "f18" ]] || - [[ $file_ext == "F18" ]] || [[ $file_ext == "ff18" ]]; then - fortran_sources+=($current_file) - elif [[ $file_ext == "a" ]] || [[ $file_ext == "so" ]]; then - libs+=($current_file) - elif [[ $file_ext == "o" ]]; then - objects+=($current_file) - else - other_sources+=($current_file) - fi - done -} - -# === categorise_opts ========================================================== -# -# Categorises compiler options into options for: -# * the Flang driver (either new or the "throwaway" driver) -# * the external Fortran driver that will generate the code -# Most options accepted by Flang will be claimed by it. The only exceptions are -# `-I` and `-J`. -# -# INPUTS: -# $1 - all compiler options (array, name reference) -# OUTPUTS: -# $2 - compiler options for the Flang driver (array, name reference) -# $3 - compiler options for the external driver (array, name reference) -# ============================================================================= -categorise_opts() -{ - local -n all_opts=$1 - local -n flang_opts=$2 - local -n fc_opts=$3 - - for opt in "${all_opts[@]}"; do - # These options are claimed by Flang, but should've been dealt with in parse_args. - if [[ $opt == "-module-dir" ]] || - [[ $opt == "-o" ]] || - [[ $opt == "-fintrinsic-modules-path" ]] ; then - echo "ERROR: $opt should've been fully processed by \`parse_args\`" - exit 1 - fi - - if - # The options claimed by Flang. This list needs to be compatible with - # what's supported by Flang's compiler driver (i.e. `flang-new`). 
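The removed wrapper's categorise_files routine routes each input purely by file extension: Fortran sources go through the unparse-then-recompile pipeline, while objects and libraries are deferred to the link step. The same rule in a compact C++ sketch (a hypothetical helper mirroring the deleted script's extension list):

#include <iostream>
#include <string>

enum class Kind { Fortran, Object, Library, Other };

Kind classify(const std::string& path) {
  const auto dot = path.rfind('.');
  const std::string ext = dot == std::string::npos ? "" : path.substr(dot + 1);
  for (const char* f : {"f", "F", "ff", "f90", "F90", "ff90", "f95", "F95",
                        "ff95", "cuf", "CUF", "f18", "F18", "ff18"})
    if (ext == f)
      return Kind::Fortran;
  if (ext == "o")
    return Kind::Object;
  if (ext == "a" || ext == "so")
    return Kind::Library;
  return Kind::Other;
}

int main() {
  std::cout << (classify("main.f90") == Kind::Fortran) << '\n';  // 1
  std::cout << (classify("libfoo.so") == Kind::Library) << '\n'; // 1
  return 0;
}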
- [[ $opt == "-cpp" ]] || - [[ $opt =~ ^-D.* ]] || - [[ $opt == "-E" ]] || - [[ $opt == "-falternative-parameter-statement" ]] || - [[ $opt == "-fbackslash" ]] || - [[ $opt == "-fcolor-diagnostics" ]] || - [[ $opt == "-fdefault-double-8" ]] || - [[ $opt == "-fdefault-integer-8" ]] || - [[ $opt == "-fdefault-real-8" ]] || - [[ $opt == "-ffixed-form" ]] || - [[ $opt =~ ^-ffixed-line-length=.* ]] || - [[ $opt == "-ffree-form" ]] || - [[ $opt == "-fimplicit-none" ]] || - [[ $opt =~ ^-finput-charset=.* ]] || - [[ $opt == "-flarge-sizes" ]] || - [[ $opt == "-flogical-abbreviations" ]] || - [[ $opt == "-fno-color-diagnostics" ]] || - [[ $opt == "-fxor-operator" ]] || - [[ $opt == "-help" ]] || - [[ $opt == "-nocpp" ]] || - [[ $opt == "-pedantic" ]] || - [[ $opt =~ ^-std=.* ]] || - [[ $opt =~ ^-U.* ]] || - [[ $opt == "-Werror" ]]; then - flang_opts+=($opt) - elif - # We translate the following into equivalents understood by `flang-new` - [[ $opt == "-Mfixed" ]] || [[ $opt == "-Mfree" ]]; then - case $opt in - -Mfixed) - flang_opts+=("-ffixed-form") - ;; - - -Mfree) - flang_opts+=("-ffree-form") - ;; - - *) - echo "ERROR: $opt has no equivalent in 'flang-new'" - exit 1 - ;; - esac - elif - # Options that are needed for both Flang and the external driver. - [[ $opt =~ -I.* ]] || - [[ $opt =~ -J.* ]] || - [[ $opt == "-fopenmp" ]] || - [[ $opt == "-fopenacc" ]]; then - flang_opts+=($opt) - fc_opts+=($opt) - else - # All other options are claimed for the external driver. - fc_opts+=($opt) - fi - done -} - -# === get_external_fc_name ==================================================== -# -# Returns the name of external Fortran compiler based on values of -# environment variables. -# ============================================================================= -get_external_fc_name() { - if [[ -v FLANG_FC ]]; then - echo ${FLANG_FC} - elif [[ -v F18_FC ]]; then - # We support F18_FC for backwards compatibility. - echo ${F18_FC} - else - echo gfortran - fi -} - -# === preprocess ============================================================== -# -# Runs the preprocessing. Fortran files are preprocessed using Flang. Other -# files are preprocessed using the external Fortran compiler. -# -# INPUTS: -# $1 - Fortran source files (array, name reference) -# $2 - other source files (array, name reference) -# $3 - compiler flags (array, name reference) -# ============================================================================= -preprocess() { - local -n fortran_srcs=$1 - local -n other_srcs=$2 - local -n opts=$3 - - local ext_fc="$(get_external_fc_name)" - - local -r wd=$(cd "$(dirname "$0")/.." && pwd) - - # Use the provided output file name. - if [[ ! -z ${OUTPUT_FILE:+x} ]]; then - output_definition="-o $OUTPUT_FILE" - fi - - # Preprocess fortran sources using Flang - for idx in "${!fortran_srcs[@]}"; do - if ! "$wd/bin/flang-new" -E "${opts[@]}" "${fortran_srcs[$idx]}" ${output_definition:+$output_definition} - then status=$? - echo flang: in "$PWD", flang-new failed with exit status $status: "$wd/bin/flang-new" "${opts[@]}" "$@" >&2 - exit $status - fi - done - - # Preprocess other sources using Flang - for idx in "${!other_srcs[@]}"; do - if ! $ext_fc -E "${opts[@]}" "${other_srcs[$idx]}" ${output_definition:+$output_definition} - then status=$? 
- echo flang: in "$PWD", flang-new failed with exit status $status: "$wd/bin/flang-new" "${opts[@]}" "$@" >&2 - exit $status - fi - done -} - -# === get_relocatable_name ====================================================== -# This method generates the name of the output file for the compilation phase -# (triggered with `-c`). If the user of this script is only interested in -# compilation (`flang -c`), use $OUTPUT_FILE provided that it was defined. -# Otherwise, use the usual heuristics: -# * file.f --> file.o -# * file.c --> file.o -# -# INPUTS: -# $1 - input source file for which to generate the output name -# ============================================================================= -get_relocatable_name() { - local -r src_file=$1 - - if [[ $COMPILE_ONLY == "True" ]] && [[ ! -z ${OUTPUT_FILE:+x} ]]; then - out_file="$OUTPUT_FILE" - else - current_ext=${src_file##*.} - new_ext="o" - - out_file=$(basename "${src_file}" "$current_ext")${new_ext} - fi - - echo "$out_file" -} - -# === main ==================================================================== -# Main entry point for this script -# ============================================================================= -main() { - check_bash_version - parse_args "$@" - - if [[ $PRINT_VERSION == "True" ]]; then - echo "flang version @FLANG_VERSION@" - exit 0 - fi - - # Source, object and library files provided by the user - local fortran_source_files=() - local other_source_files=() - local object_files=() - local lib_files=() - categorise_files INPUT_FILES fortran_source_files other_source_files object_files lib_files - - if [[ $PREPROCESS_ONLY == "True" ]]; then - preprocess fortran_source_files other_source_files OPTIONS - exit 0 - fi - - # Options for the Flang driver. - # NOTE: We need `-fc1` to make sure that the frontend driver rather than - # compiler driver is used. We also need to make sure that that's the first - # flag that the driver will see (otherwise it assumes compiler/toolchain - # driver mode). - local flang_options=("-fc1") - # Options for the external Fortran Compiler - local ext_fc_options=() - categorise_opts OPTIONS flang_options ext_fc_options - - local -r wd=$(cd "$(dirname "$0")/.." && pwd) - - # uuidgen is common but not installed by default on some distros - if ! command -v uuidgen &> /dev/null - then - echo "uuidgen is required for generating unparsed file names." - exit 1 - fi - - # STEP 1: Unparse - # Base-name for the unparsed files. These are just temporary files that are - # first generated and then deleted by this script. - # NOTE: We need to make sure that the base-name is unique to every - # invocation. Otherwise we can't use this script in parallel. - local -r unique_id=$(uuidgen | cut -b25-36) - local -r unparsed_file_base="flang_unparsed_file_$unique_id" - - flang_options+=("-module-suffix") - flang_options+=(".f18.mod") - flang_options+=("-fdebug-unparse") - flang_options+=("-fno-analyzed-objects-for-unparse") - - [[ ! -z ${MODULE_DIR} ]] && flang_options+=("-module-dir ${MODULE_DIR}") - [[ ! -z ${INTRINSICS_MOD_DIR} ]] && flang_options+=("-intrinsics-module-directory ${INTRINSICS_MOD_DIR}") - for idx in "${!fortran_source_files[@]}"; do - set +e - "$wd/bin/flang-new" "${flang_options[@]}" "${fortran_source_files[$idx]}" -o "${unparsed_file_base}_${idx}.f90" - ret_status=$? 
- set -e - if [[ $ret_status != 0 ]]; then - echo flang: in "$PWD", flang-new failed with exit status "$ret_status": "$wd/bin/flang-new" "${flang_options[@]}" "$@" >&2 - exit "$ret_status" - fi - done - - # STEP 2: Compile Fortran Source Files - local ext_fc="$(get_external_fc_name)" - # Temporary object files generated by this script. To be deleted at the end. - local temp_object_files=() - for idx in "${!fortran_source_files[@]}"; do - # We always have to specify the output name with `-o `. This - # is because we are using the unparsed rather than the original source file - # below. As a result, we cannot rely on the compiler-generated output name. - out_obj_file=$(get_relocatable_name "${fortran_source_files[$idx]}") - - set +e - $ext_fc "-c" "${ext_fc_options[@]}" "${unparsed_file_base}_${idx}.f90" "-o" "${out_obj_file}" - ret_status=$? - set -e - if [[ $ret_status != 0 ]]; then - echo flang: in "$PWD", "$ext_fc" failed with exit status "$ret_status": "$ext_fc" "${ext_fc_options[@]}" "$@" >&2 - exit "$ret_status" - fi - temp_object_files+=(${out_obj_file}) - done - - # Delete the unparsed files - for idx in "${!fortran_source_files[@]}"; do - rm "${unparsed_file_base}_${idx}.f90" - done - - # STEP 3: Compile Other Source Files - for idx in "${!other_source_files[@]}"; do - # We always specify the output name with `-o `. The user - # might have used `-o`, but we never add it to $OPTIONS (or - # $ext_fc_options). Hence we need to use `get_relocatable_name`. - out_obj_file=$(get_relocatable_name "${other_source_files[$idx]}") - - set +e - $ext_fc "-c" "${ext_fc_options[@]}" "${other_source_files[${idx}]}" "-o" "${out_obj_file}" - ret_status=$? - set -e - if [[ $ret_status != 0 ]]; then - echo flang: in "$PWD", "$ext_fc" failed with exit status "$ret_status": "$ext_fc" "${ext_fc_options[@]}" "$@" >&2 - exit "$ret_status" - fi - temp_object_files+=(${out_obj_file}) - done - - # STEP 4: Link - if [[ $COMPILE_ONLY == "True" ]]; then - exit 0; - fi - - if [[ ${#temp_object_files[@]} -ge 1 ]] || [[ ${#object_files[@]} -ge 1 ]] ; then - # If $OUTPUT_FILE was specified, use it for the output name. - if [[ ! -z ${OUTPUT_FILE:+x} ]]; then - output_definition="-o $OUTPUT_FILE" - else - output_definition="" - fi - - set +e - $ext_fc "${ext_fc_options[@]}" "${object_files[@]}" "${temp_object_files[@]}" "${lib_files[@]}" ${output_definition:+$output_definition} - ret_status=$? 
- set -e - if [[ $ret_status != 0 ]]; then - echo flang: in "$PWD", "$ext_fc" failed with exit status "$ret_status": "$ext_fc" "${ext_fc_options[@]}" "$@" >&2 - exit "$ret_status" - fi - fi - - # Delete intermediate object files - for idx in "${!fortran_source_files[@]}"; do - rm "${temp_object_files[$idx]}" - done -} - -main "${@}" diff --git a/libc/test/src/pthread/CMakeLists.txt b/libc/test/src/pthread/CMakeLists.txt index 51954a5babd2c5..4d01b667f12c83 100644 --- a/libc/test/src/pthread/CMakeLists.txt +++ b/libc/test/src/pthread/CMakeLists.txt @@ -47,9 +47,9 @@ add_libc_unittest( SRCS pthread_condattr_test.cpp DEPENDS - libc.include.errno + libc.include.llvm-libc-macros.generic_error_number_macros + libc.include.llvm-libc-macros.time_macros libc.include.pthread - libc.include.time libc.src.pthread.pthread_condattr_destroy libc.src.pthread.pthread_condattr_getclock libc.src.pthread.pthread_condattr_getpshared diff --git a/libc/test/src/pthread/pthread_condattr_test.cpp b/libc/test/src/pthread/pthread_condattr_test.cpp index accb62de92e45f..5fcdbd99cb0e20 100644 --- a/libc/test/src/pthread/pthread_condattr_test.cpp +++ b/libc/test/src/pthread/pthread_condattr_test.cpp @@ -6,16 +6,23 @@ // //===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/generic-error-number-macros.h" // EINVAL +#include "include/llvm-libc-macros/time-macros.h" // CLOCK_REALTIME, CLOCK_MONOTONIC +#include "src/pthread/pthread_condattr_destroy.h" +#include "src/pthread/pthread_condattr_getclock.h" +#include "src/pthread/pthread_condattr_getpshared.h" +#include "src/pthread/pthread_condattr_init.h" +#include "src/pthread/pthread_condattr_setclock.h" +#include "src/pthread/pthread_condattr_setpshared.h" #include "test/UnitTest/Test.h" -#include -#include -#include +// TODO: https://github.com/llvm/llvm-project/issues/88997 +#include // PTHREAD_PROCESS_PRIVATE, PTHREAD_PROCESS_SHARED TEST(LlvmLibcPThreadCondAttrTest, InitAndDestroy) { pthread_condattr_t cond; - ASSERT_EQ(pthread_condattr_init(&cond), 0); - ASSERT_EQ(pthread_condattr_destroy(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_init(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_destroy(&cond), 0); } TEST(LlvmLibcPThreadCondAttrTest, GetDefaultValues) { @@ -26,12 +33,12 @@ TEST(LlvmLibcPThreadCondAttrTest, GetDefaultValues) { // Invalid value. int pshared = 42; - ASSERT_EQ(pthread_condattr_init(&cond), 0); - ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_init(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_getclock(&cond, &clock), 0); ASSERT_EQ(clock, CLOCK_REALTIME); - ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_getpshared(&cond, &pshared), 0); ASSERT_EQ(pshared, PTHREAD_PROCESS_PRIVATE); - ASSERT_EQ(pthread_condattr_destroy(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_destroy(&cond), 0); } TEST(LlvmLibcPThreadCondAttrTest, SetGoodValues) { @@ -42,14 +49,17 @@ TEST(LlvmLibcPThreadCondAttrTest, SetGoodValues) { // Invalid value. 
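The condattr test being migrated here pins down the POSIX defaults (CLOCK_REALTIME, PTHREAD_PROCESS_PRIVATE) and the EINVAL error paths, now calling the LIBC_NAMESPACE entry points directly instead of the public headers. Against any conforming pthread implementation the same checks look like this (portable sketch, not the libc test itself):

#include <cassert>
#include <pthread.h>
#include <time.h>

int main() {
  pthread_condattr_t attr;
  assert(pthread_condattr_init(&attr) == 0);

  // POSIX-mandated defaults.
  clockid_t clock;
  assert(pthread_condattr_getclock(&attr, &clock) == 0);
  assert(clock == CLOCK_REALTIME);

  int pshared;
  assert(pthread_condattr_getpshared(&attr, &pshared) == 0);
  assert(pshared == PTHREAD_PROCESS_PRIVATE);

  // CLOCK_MONOTONIC makes timed waits immune to wall-clock adjustments.
  assert(pthread_condattr_setclock(&attr, CLOCK_MONOTONIC) == 0);

  assert(pthread_condattr_destroy(&attr) == 0);
  return 0;
}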
int pshared = 42; - ASSERT_EQ(pthread_condattr_init(&cond), 0); - ASSERT_EQ(pthread_condattr_setclock(&cond, CLOCK_MONOTONIC), 0); - ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_init(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_setclock(&cond, CLOCK_MONOTONIC), + 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_getclock(&cond, &clock), 0); ASSERT_EQ(clock, CLOCK_MONOTONIC); - ASSERT_EQ(pthread_condattr_setpshared(&cond, PTHREAD_PROCESS_SHARED), 0); - ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_setpshared(&cond, + PTHREAD_PROCESS_SHARED), + 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_getpshared(&cond, &pshared), 0); ASSERT_EQ(pshared, PTHREAD_PROCESS_SHARED); - ASSERT_EQ(pthread_condattr_destroy(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_destroy(&cond), 0); } TEST(LlvmLibcPThreadCondAttrTest, SetBadValues) { @@ -60,12 +70,13 @@ TEST(LlvmLibcPThreadCondAttrTest, SetBadValues) { // Invalid value. int pshared = 42; - ASSERT_EQ(pthread_condattr_init(&cond), 0); - ASSERT_EQ(pthread_condattr_setclock(&cond, clock), EINVAL); - ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_init(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_setclock(&cond, clock), EINVAL); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_getclock(&cond, &clock), 0); ASSERT_EQ(clock, CLOCK_REALTIME); - ASSERT_EQ(pthread_condattr_setpshared(&cond, pshared), EINVAL); - ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_setpshared(&cond, pshared), + EINVAL); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_getpshared(&cond, &pshared), 0); ASSERT_EQ(pshared, PTHREAD_PROCESS_PRIVATE); - ASSERT_EQ(pthread_condattr_destroy(&cond), 0); + ASSERT_EQ(LIBC_NAMESPACE::pthread_condattr_destroy(&cond), 0); } diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h index f7256db3bea661..881a4970822d8e 100644 --- a/libcxx/include/__chrono/convert_to_tm.h +++ b/libcxx/include/__chrono/convert_to_tm.h @@ -173,7 +173,7 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) { if (__value.hours().count() > std::numeric_limits::max()) std::__throw_format_error("Formatting hh_mm_ss, encountered an hour overflow"); __result.tm_hour = __value.hours().count(); -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) } else if constexpr (same_as<_ChronoT, chrono::sys_info>) { // Has no time information. 
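__convert_to_tm bridges chrono types to a tm-like struct for the %-conversions; note the guard above that throws format_error when an hh_mm_ss hour count will not fit in tm_hour. A small usage sketch of this code path (C++20 chrono formatting, assuming a standard library recent enough to provide it):

#include <chrono>
#include <format>
#include <iostream>

int main() {
  using namespace std::chrono;
  const hh_mm_ss hms{90min + 5s};
  // %T is rendered by filling a tm via __convert_to_tm internally.
  std::cout << std::format("{:%T}", hms) << '\n'; // 01:30:05
  return 0;
}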
} else if constexpr (same_as<_ChronoT, chrono::local_info>) { diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index 6a14c344fa1188..226fccbee6d133 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -205,7 +205,7 @@ struct _LIBCPP_HIDE_FROM_ABI __time_zone { template _LIBCPP_HIDE_FROM_ABI __time_zone __convert_to_time_zone([[maybe_unused]] const _Tp& __value) { -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) if constexpr (same_as<_Tp, chrono::sys_info>) return {__value.abbrev, __value.offset}; else @@ -417,7 +417,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_ok(const _Tp& __value) { return __value.weekday().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) else if constexpr (same_as<_Tp, chrono::sys_info>) return true; else if constexpr (same_as<_Tp, chrono::local_info>) @@ -463,7 +463,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_name_ok(const _Tp& __value) { return __value.weekday().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) else if constexpr (same_as<_Tp, chrono::sys_info>) return true; else if constexpr (same_as<_Tp, chrono::local_info>) @@ -509,7 +509,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __date_ok(const _Tp& __value) { return __value.ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) else if constexpr (same_as<_Tp, chrono::sys_info>) return true; else if constexpr (same_as<_Tp, chrono::local_info>) @@ -555,7 +555,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __month_name_ok(const _Tp& __value) { return __value.month().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) else if constexpr (same_as<_Tp, chrono::sys_info>) return true; else if constexpr (same_as<_Tp, chrono::local_info>) @@ -891,7 +891,7 @@ struct formatter, _CharT> : public __formatter_chron } }; -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) template <__fmt_char_type _CharT> struct formatter : public __formatter_chrono<_CharT> { public: @@ -913,7 +913,7 @@ struct formatter : public __formatter_chrono<_CharT> return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags{}); } }; -# endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__chrono/leap_second.h b/libcxx/include/__chrono/leap_second.h index 557abc15ff1847..2bbf0636467392 100644 --- a/libcxx/include/__chrono/leap_second.h +++ b/libcxx/include/__chrono/leap_second.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. 
-#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__chrono/duration.h> # include <__chrono/system_clock.h> @@ -121,6 +121,6 @@ _LIBCPP_HIDE_FROM_ABI constexpr auto operator<=>(const leap_second& __x, const s _LIBCPP_END_NAMESPACE_STD -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_LEAP_SECOND_H diff --git a/libcxx/include/__chrono/local_info.h b/libcxx/include/__chrono/local_info.h index b1a03ad7df2aca..cfe1448904d3f7 100644 --- a/libcxx/include/__chrono/local_info.h +++ b/libcxx/include/__chrono/local_info.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__chrono/sys_info.h> # include <__config> @@ -45,6 +45,6 @@ struct local_info { _LIBCPP_END_NAMESPACE_STD -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_LOCAL_INFO_H diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index cb17dbea58bee3..ecf07a320c8b94 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -264,7 +264,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const hh_mm_ss<_Duration> __hms return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%T}"), __hms); } -# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) template _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& @@ -302,7 +302,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const local_info& __info) { _LIBCPP_STATICALLY_WIDEN(_CharT, "{}: {{{}, {}}}"), __result(), __info.first, __info.second); } -# endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) } // namespace chrono diff --git a/libcxx/include/__chrono/sys_info.h b/libcxx/include/__chrono/sys_info.h index 461d5322d413b3..11536cbde3a37c 100644 --- a/libcxx/include/__chrono/sys_info.h +++ b/libcxx/include/__chrono/sys_info.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__chrono/duration.h> # include <__chrono/system_clock.h> @@ -46,6 +46,6 @@ struct sys_info { _LIBCPP_END_NAMESPACE_STD -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_SYS_INFO_H diff --git a/libcxx/include/__chrono/time_zone.h b/libcxx/include/__chrono/time_zone.h index 8e30034b799ad9..799602c1cdbaf0 100644 --- a/libcxx/include/__chrono/time_zone.h +++ b/libcxx/include/__chrono/time_zone.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. 
-#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__chrono/duration.h> # include <__chrono/sys_info.h> @@ -92,6 +92,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_TIME_ZONE_H diff --git a/libcxx/include/__chrono/time_zone_link.h b/libcxx/include/__chrono/time_zone_link.h index c76ddeff9f966d..f44137829a8145 100644 --- a/libcxx/include/__chrono/time_zone_link.h +++ b/libcxx/include/__chrono/time_zone_link.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__compare/strong_order.h> # include <__config> @@ -74,6 +74,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_TIME_ZONE_LINK_H diff --git a/libcxx/include/__chrono/tzdb.h b/libcxx/include/__chrono/tzdb.h index e0bfedf0d78239..12fe6ccb63f94c 100644 --- a/libcxx/include/__chrono/tzdb.h +++ b/libcxx/include/__chrono/tzdb.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__algorithm/ranges_lower_bound.h> # include <__chrono/leap_second.h> @@ -89,6 +89,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_TZDB_H diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h index 693899d372112d..ae27067dbf02c3 100644 --- a/libcxx/include/__chrono/tzdb_list.h +++ b/libcxx/include/__chrono/tzdb_list.h @@ -14,7 +14,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. 
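All of these headers share one gate: their contents compile only when libc++ is built and used with experimental features enabled (-fexperimental-library, per the test below), and the rename from "incomplete" to "experimental" tracks that. From user code, the portable way to detect the C++20 time-zone database is the feature-test macro rather than any internal _LIBCPP macro; a sketch:

#include <chrono>
#include <iostream>

int main() {
#if defined(__cpp_lib_chrono) && __cpp_lib_chrono >= 201907L
  // Full C++20 chrono, including the time-zone database.
  const std::chrono::tzdb& db = std::chrono::get_tzdb();
  std::cout << "tzdb version: " << db.version << '\n';
#else
  std::cout << "time-zone database not available\n";
#endif
  return 0;
}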
-#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) # include <__availability> # include <__chrono/time_zone.h> @@ -105,6 +105,6 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_AVAILABILITY_TZDB _LIBCPP_EXPORTED_FROM_ABI string _LIBCPP_END_NAMESPACE_STD -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) #endif // _LIBCPP___CHRONO_TZDB_LIST_H diff --git a/libcxx/include/__config b/libcxx/include/__config index 4ccef2ca0d73b4..4f5c1476626de2 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -421,7 +421,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # if !defined(_LIBCPP_ENABLE_EXPERIMENTAL) && !defined(_LIBCPP_BUILDING_LIBRARY) # define _LIBCPP_HAS_NO_INCOMPLETE_PSTL # define _LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN -# define _LIBCPP_HAS_NO_INCOMPLETE_TZDB +# define _LIBCPP_HAS_NO_EXPERIMENTAL_TZDB # define _LIBCPP_HAS_NO_EXPERIMENTAL_SYNCSTREAM # endif diff --git a/libcxx/include/__memory/uses_allocator_construction.h b/libcxx/include/__memory/uses_allocator_construction.h index 9b7262bec5cf8b..5e5819d4c281e4 100644 --- a/libcxx/include/__memory/uses_allocator_construction.h +++ b/libcxx/include/__memory/uses_allocator_construction.h @@ -184,20 +184,18 @@ __uses_allocator_construction_args(const _Alloc& __alloc, _Type&& __value) noexc struct __pair_constructor { using _PairMutable = remove_cv_t<_Pair>; - _LIBCPP_HIDE_FROM_ABI constexpr auto __do_construct(const _PairMutable& __pair) const { + _LIBCPP_HIDDEN constexpr auto __do_construct(const _PairMutable& __pair) const { return std::__make_obj_using_allocator<_PairMutable>(__alloc_, __pair); } - _LIBCPP_HIDE_FROM_ABI constexpr auto __do_construct(_PairMutable&& __pair) const { + _LIBCPP_HIDDEN constexpr auto __do_construct(_PairMutable&& __pair) const { return std::__make_obj_using_allocator<_PairMutable>(__alloc_, std::move(__pair)); } const _Alloc& __alloc_; _Type& __value_; - _LIBCPP_HIDE_FROM_ABI constexpr operator _PairMutable() const { - return __do_construct(std::forward<_Type>(this->__value_)); - } + _LIBCPP_HIDDEN constexpr operator _PairMutable() const { return __do_construct(std::forward<_Type>(__value_)); } }; return std::make_tuple(__pair_constructor{__alloc, __value}); diff --git a/libcxx/include/__random/linear_congruential_engine.h b/libcxx/include/__random/linear_congruential_engine.h index fe9cb909b74d21..9d77649e9cfc8e 100644 --- a/libcxx/include/__random/linear_congruential_engine.h +++ b/libcxx/include/__random/linear_congruential_engine.h @@ -26,32 +26,60 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD +enum __lce_alg_type { + _LCE_Full, + _LCE_Part, + _LCE_Schrage, + _LCE_Promote, +}; + template (_Mp - __c) / __a), - bool _OverflowOK = ((__m & (__m - 1)) == 0ull), // m = 2^n - bool _SchrageOK = (__a != 0 && __m != 0 && __m % __a <= __m / __a)> // r <= q + bool _HasOverflow = (__a != 0ull && (__m & (__m - 1ull)) != 0ull), // a != 0, m != 0, m != 2^n + bool _Full = (!_HasOverflow || __m - 1ull <= (_Mp - __c) / __a), // (a * x + c) % m works + bool _Part = (!_HasOverflow || __m - 1ull <= _Mp / __a), // (a * x) % m works + bool _Schrage = (_HasOverflow && __m % __a <= __m / __a)> // r <= q struct __lce_alg_picker { - static_assert(!_MightOverflow || _OverflowOK || _SchrageOK, - "The current values of a, c, and m cannot generate a number " - "within bounds of linear_congruential_engine."); - - static _LIBCPP_CONSTEXPR const bool __use_schrage = _MightOverflow && !_OverflowOK && _SchrageOK; + static _LIBCPP_CONSTEXPR const 
__lce_alg_type __mode = + _Full ? _LCE_Full + : _Part ? _LCE_Part + : _Schrage ? _LCE_Schrage + : _LCE_Promote; + +#ifdef _LIBCPP_HAS_NO_INT128 + static_assert(_Mp != (unsigned long long)(-1) || _Full || _Part || _Schrage, + "The current values for a, c, and m are not currently supported on platforms without __int128"); +#endif }; template ::__use_schrage> + __lce_alg_type _Mode = __lce_alg_picker<__a, __c, __m, _Mp>::__mode> struct __lce_ta; // 64 +#ifndef _LIBCPP_HAS_NO_INT128 +template +struct __lce_ta<_Ap, _Cp, _Mp, (unsigned long long)(-1), _LCE_Promote> { + typedef unsigned long long result_type; + _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __xp) { + __extension__ using __calc_type = unsigned __int128; + const __calc_type __a = static_cast<__calc_type>(_Ap); + const __calc_type __c = static_cast<__calc_type>(_Cp); + const __calc_type __m = static_cast<__calc_type>(_Mp); + const __calc_type __x = static_cast<__calc_type>(__xp); + return static_cast((__a * __x + __c) % __m); + } +}; +#endif + template -struct __lce_ta<__a, __c, __m, (unsigned long long)(~0), true> { +struct __lce_ta<__a, __c, __m, (unsigned long long)(-1), _LCE_Schrage> { typedef unsigned long long result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { // Schrage's algorithm @@ -66,7 +94,7 @@ struct __lce_ta<__a, __c, __m, (unsigned long long)(~0), true> { }; template -struct __lce_ta<__a, 0, __m, (unsigned long long)(~0), true> { +struct __lce_ta<__a, 0ull, __m, (unsigned long long)(-1), _LCE_Schrage> { typedef unsigned long long result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { // Schrage's algorithm @@ -80,21 +108,40 @@ struct __lce_ta<__a, 0, __m, (unsigned long long)(~0), true> { }; template -struct __lce_ta<__a, __c, __m, (unsigned long long)(~0), false> { +struct __lce_ta<__a, __c, __m, (unsigned long long)(-1), _LCE_Part> { + typedef unsigned long long result_type; + _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { + // Use (((a*x) % m) + c) % m + __x = (__a * __x) % __m; + __x += __c - (__x >= __m - __c) * __m; + return __x; + } +}; + +template +struct __lce_ta<__a, __c, __m, (unsigned long long)(-1), _LCE_Full> { typedef unsigned long long result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { return (__a * __x + __c) % __m; } }; template -struct __lce_ta<__a, __c, 0, (unsigned long long)(~0), false> { +struct __lce_ta<__a, __c, 0ull, (unsigned long long)(-1), _LCE_Full> { typedef unsigned long long result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { return __a * __x + __c; } }; // 32 +template +struct __lce_ta<__a, __c, __m, unsigned(-1), _LCE_Promote> { + typedef unsigned result_type; + _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { + return static_cast(__lce_ta<__a, __c, __m, (unsigned long long)(-1)>::next(__x)); + } +}; + template -struct __lce_ta<_Ap, _Cp, _Mp, unsigned(~0), true> { +struct __lce_ta<_Ap, _Cp, _Mp, unsigned(-1), _LCE_Schrage> { typedef unsigned result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { const result_type __a = static_cast(_Ap); @@ -112,7 +159,7 @@ struct __lce_ta<_Ap, _Cp, _Mp, unsigned(~0), true> { }; template -struct __lce_ta<_Ap, 0, _Mp, unsigned(~0), true> { +struct __lce_ta<_Ap, 0ull, _Mp, unsigned(-1), _LCE_Schrage> { typedef unsigned result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { const result_type __a = static_cast(_Ap); @@ -128,7 +175,21 @@ struct __lce_ta<_Ap, 0, _Mp, 
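The rewritten algorithm picker chooses among four strategies: _LCE_Full when (a*x + c) cannot overflow, _LCE_Part when (a*x) alone fits and c is folded in afterwards, _LCE_Schrage when m % a <= m / a, and _LCE_Promote as a last resort through a wider integer type (__int128 for 64-bit engines). A self-contained sketch of Schrage's decomposition, cross-checked against the promotion strategy (assumes the compiler provides unsigned __int128):

#include <cassert>
#include <cstdint>

// Computes (a * x) % m without overflow, valid when r = m % a <= q = m / a.
// Identity: a*x = a*q*(x/q) + a*(x%q) and a*q = m - r, so modulo m
//           a*x == a*(x%q) - r*(x/q)  (mod m).
uint64_t schrage_mulmod(uint64_t a, uint64_t x, uint64_t m) {
  const uint64_t q = m / a;
  const uint64_t r = m % a;
  const uint64_t t0 = a * (x % q); // <= a*q - a < m: no overflow
  const uint64_t t1 = r * (x / q); // <= q*(x/q) <= x < m: no overflow
  return t0 >= t1 ? t0 - t1 : m - t1 + t0;
}

int main() {
  const uint64_t a = 48271;             // the minstd multiplier
  const uint64_t m = (1ull << 62) - 57; // a*(m-1) overflows 64 bits
  assert(m % a <= m / a);               // Schrage's precondition

  uint64_t x = 123456789;
  unsigned __int128 wide = x;
  for (int i = 0; i < 1000; ++i) {
    x = schrage_mulmod(a, x, m);
    wide = (static_cast<unsigned __int128>(a) * wide) % m; // the _LCE_Promote way
    assert(x == static_cast<uint64_t>(wide));
  }
  return 0;
}

The _LCE_Part fold-in of c rests on the same no-overflow reasoning: after x = (a*x) % m we have x < m, the test (x >= m - c) detects whether x + c would reach m, and in that case c - m is added instead, relying on unsigned wraparound so no intermediate ever overflows.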
unsigned(~0), true> { }; template -struct __lce_ta<_Ap, _Cp, _Mp, unsigned(~0), false> { +struct __lce_ta<_Ap, _Cp, _Mp, unsigned(-1), _LCE_Part> { + typedef unsigned result_type; + _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { + const result_type __a = static_cast(_Ap); + const result_type __c = static_cast(_Cp); + const result_type __m = static_cast(_Mp); + // Use (((a*x) % m) + c) % m + __x = (__a * __x) % __m; + __x += __c - (__x >= __m - __c) * __m; + return __x; + } +}; + +template +struct __lce_ta<_Ap, _Cp, _Mp, unsigned(-1), _LCE_Full> { typedef unsigned result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { const result_type __a = static_cast(_Ap); @@ -139,7 +200,7 @@ struct __lce_ta<_Ap, _Cp, _Mp, unsigned(~0), false> { }; template -struct __lce_ta<_Ap, _Cp, 0, unsigned(~0), false> { +struct __lce_ta<_Ap, _Cp, 0ull, unsigned(-1), _LCE_Full> { typedef unsigned result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { const result_type __a = static_cast(_Ap); @@ -150,11 +211,11 @@ struct __lce_ta<_Ap, _Cp, 0, unsigned(~0), false> { // 16 -template -struct __lce_ta<__a, __c, __m, (unsigned short)(~0), __b> { +template +struct __lce_ta<__a, __c, __m, (unsigned short)(-1), __mode> { typedef unsigned short result_type; _LIBCPP_HIDE_FROM_ABI static result_type next(result_type __x) { - return static_cast(__lce_ta<__a, __c, __m, unsigned(~0)>::next(__x)); + return static_cast(__lce_ta<__a, __c, __m, unsigned(-1)>::next(__x)); } }; @@ -178,7 +239,7 @@ class _LIBCPP_TEMPLATE_VIS linear_congruential_engine { private: result_type __x_; - static _LIBCPP_CONSTEXPR const result_type _Mp = result_type(~0); + static _LIBCPP_CONSTEXPR const result_type _Mp = result_type(-1); static_assert(__m == 0 || __a < __m, "linear_congruential_engine invalid parameters"); static_assert(__m == 0 || __c < __m, "linear_congruential_engine invalid parameters"); diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h index 47ed1057caaab1..1fd22d518e1ab6 100644 --- a/libcxx/include/__string/char_traits.h +++ b/libcxx/include/__string/char_traits.h @@ -10,6 +10,7 @@ #define _LIBCPP___STRING_CHAR_TRAITS_H #include <__algorithm/fill_n.h> +#include <__algorithm/find.h> #include <__algorithm/find_end.h> #include <__algorithm/find_first_of.h> #include <__algorithm/min.h> @@ -17,6 +18,7 @@ #include <__compare/ordering.h> #include <__config> #include <__functional/hash.h> +#include <__functional/identity.h> #include <__iterator/iterator_traits.h> #include <__string/constexpr_c_functions.h> #include <__type_traits/is_constant_evaluated.h> @@ -272,10 +274,14 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { return std::__constexpr_memcmp(__s1, __s2, __element_count(__n)); } - static _LIBCPP_HIDE_FROM_ABI constexpr size_t length(const char_type* __s) _NOEXCEPT; + static _LIBCPP_HIDE_FROM_ABI constexpr size_t length(const char_type* __str) _NOEXCEPT { + return std::__constexpr_strlen(__str); + } _LIBCPP_HIDE_FROM_ABI static constexpr const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT; + find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + return std::__constexpr_memchr(__s, __a, __n); + } static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { @@ -307,25 +313,6 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { static inline _LIBCPP_HIDE_FROM_ABI constexpr int_type eof() noexcept { return 
int_type(EOF); } }; -// TODO use '__builtin_strlen' if it ever supports char8_t ?? -inline constexpr size_t char_traits::length(const char_type* __s) _NOEXCEPT { - size_t __len = 0; - for (; !eq(*__s, char_type(0)); ++__s) - ++__len; - return __len; -} - -// TODO use '__builtin_char_memchr' if it ever supports char8_t ?? -inline constexpr const char8_t* -char_traits::find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { - for (; __n; --__n) { - if (eq(*__s, __a)) - return __s; - ++__s; - } - return nullptr; -} - #endif // _LIBCPP_HAS_NO_CHAR8_T template <> @@ -353,9 +340,15 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT; - _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* + find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + __identity __proj; + const char_type* __match = std::__find_impl(__s, __s + __n, __a, __proj); + if (__match == __s + __n) + return nullptr; + return __match; + } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { return std::__constexpr_memmove(__s1, __s2, __element_count(__n)); @@ -408,16 +401,6 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const return __len; } -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 const char16_t* -char_traits::find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { - for (; __n; --__n) { - if (eq(*__s, __a)) - return __s; - ++__s; - } - return nullptr; -} - template <> struct _LIBCPP_TEMPLATE_VIS char_traits { using char_type = char32_t; @@ -443,8 +426,15 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type* - find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT; + find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { + __identity __proj; + const char_type* __match = std::__find_impl(__s, __s + __n, __a, __proj); + if (__match == __s + __n) + return nullptr; + return __match; + } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { @@ -496,16 +486,6 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const return __len; } -inline _LIBCPP_CONSTEXPR_SINCE_CXX17 const char32_t* -char_traits::find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT { - for (; __n; --__n) { - if (eq(*__s, __a)) - return __s; - ++__s; - } - return nullptr; -} - // helper fns for basic_string and string_view // __str_find diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h index 198f0f5e680914..72c6ce69b60bb6 100644 --- a/libcxx/include/__string/constexpr_c_functions.h +++ b/libcxx/include/__string/constexpr_c_functions.h @@ -35,18 +35,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD // of 
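char_traits<char16_t>::find and the char32_t counterpart now delegate to std::__find_impl with the __identity projection instead of hand-rolled loops, so every traits class shares one optimized implementation. The shape of such a projection-based find, as a generic sketch (hypothetical names, not the libc++ internals):

#include <cassert>

// Linear find that compares the projection of each element to the value.
template <class It, class T, class Proj>
constexpr It find_with_proj(It first, It last, const T& value, Proj proj) {
  for (; first != last; ++first)
    if (proj(*first) == value)
      return first;
  return last;
}

struct identity_fn {
  template <class T>
  constexpr const T& operator()(const T& t) const { return t; }
};

int main() {
  constexpr char16_t s[] = u"hello";
  const char16_t* p = find_with_proj(s, s + 5, char16_t('l'), identity_fn{});
  assert(p - s == 2); // first 'l'
  return 0;
}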
elements as opposed to a number of bytes. enum class __element_count : size_t {}; -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_strlen(const char* __str) { +template +inline const bool __is_char_type = false; + +template <> +inline const bool __is_char_type = true; + +#ifndef _LIBCPP_HAS_NO_CHAR8_T +template <> +inline const bool __is_char_type = true; +#endif + +template +inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_strlen(const _Tp* __str) _NOEXCEPT { + static_assert(__is_char_type<_Tp>, "__constexpr_strlen only works with char and char8_t"); // GCC currently doesn't support __builtin_strlen for heap-allocated memory during constant evaluation. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70816 -#ifdef _LIBCPP_COMPILER_GCC if (__libcpp_is_constant_evaluated()) { +#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_COMPILER_CLANG_BASED) + if constexpr (is_same_v<_Tp, char>) + return __builtin_strlen(__str); +#endif size_t __i = 0; for (; __str[__i] != '\0'; ++__i) ; return __i; } -#endif - return __builtin_strlen(__str); + return __builtin_strlen(reinterpret_cast(__str)); } // Because of __libcpp_is_trivially_lexicographically_comparable we know that comparing the object representations is diff --git a/libcxx/include/sstream b/libcxx/include/sstream index 5873deb8318ecf..003c802b2647de 100644 --- a/libcxx/include/sstream +++ b/libcxx/include/sstream @@ -330,14 +330,6 @@ typedef basic_stringstream wstringstream; _LIBCPP_PUSH_MACROS #include <__undef_macros> -// TODO(LLVM-19): Remove this once we drop support for Clang 16, -// which had this bug: https://github.com/llvm/llvm-project/issues/40363 -#ifdef _WIN32 -# define _LIBCPP_HIDE_FROM_ABI_SSTREAM _LIBCPP_ALWAYS_INLINE -#else -# define _LIBCPP_HIDE_FROM_ABI_SSTREAM _LIBCPP_HIDE_FROM_ABI -#endif - _LIBCPP_BEGIN_NAMESPACE_STD // Class template basic_stringbuf [stringbuf] @@ -460,9 +452,9 @@ public: #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) string_type str() const; #else - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const& { return str(__str_.get_allocator()); } + _LIBCPP_HIDE_FROM_ABI string_type str() const& { return str(__str_.get_allocator()); } - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { + _LIBCPP_HIDE_FROM_ABI string_type str() && { const basic_string_view<_CharT, _Traits> __view = view(); typename string_type::size_type __pos = __view.empty() ? 
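The templated __constexpr_strlen keeps a plain counting loop for constant evaluation because GCC cannot constant-fold __builtin_strlen over heap-allocated memory (the GCC bug linked above) and because the builtin does not accept char8_t; at run time it still dispatches to __builtin_strlen. The constant-evaluation fallback is essentially this (minimal sketch):

#include <cstddef>

template <class CharT>
constexpr std::size_t portable_strlen(const CharT* s) {
  std::size_t n = 0;
  while (s[n] != CharT(0))
    ++n;
  return n;
}

static_assert(portable_strlen("abc") == 3, "usable in constant expressions");
static_assert(portable_strlen(u"abc") == 3, "and for any character type");

int main() { return 0; }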
0 : __view.data() - __str_.data(); // In C++23, this is just string_type(std::move(__str_), __pos, __view.size(), __str_.get_allocator()); @@ -948,9 +940,9 @@ public: #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } #else - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const& { return __sb_.str(); } + _LIBCPP_HIDE_FROM_ABI string_type str() const& { return __sb_.str(); } - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { return std::move(__sb_).str(); } + _LIBCPP_HIDE_FROM_ABI string_type str() && { return std::move(__sb_).str(); } #endif #if _LIBCPP_STD_VER >= 20 @@ -1085,9 +1077,9 @@ public: #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } #else - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const& { return __sb_.str(); } + _LIBCPP_HIDE_FROM_ABI string_type str() const& { return __sb_.str(); } - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { return std::move(__sb_).str(); } + _LIBCPP_HIDE_FROM_ABI string_type str() && { return std::move(__sb_).str(); } #endif #if _LIBCPP_STD_VER >= 20 @@ -1225,9 +1217,9 @@ public: #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_HIDE_FROM_ABI string_type str() const { return __sb_.str(); } #else - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() const& { return __sb_.str(); } + _LIBCPP_HIDE_FROM_ABI string_type str() const& { return __sb_.str(); } - _LIBCPP_HIDE_FROM_ABI_SSTREAM string_type str() && { return std::move(__sb_).str(); } + _LIBCPP_HIDE_FROM_ABI string_type str() && { return std::move(__sb_).str(); } #endif #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/variant b/libcxx/include/variant index 1b5e84e9547953..858a49b980bd9a 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -909,8 +909,8 @@ protected: __a.__value = std::forward<_Arg>(__arg); } else { struct { - _LIBCPP_HIDE_FROM_ABI void operator()(true_type) const { __this->__emplace<_Ip>(std::forward<_Arg>(__arg)); } - _LIBCPP_HIDE_FROM_ABI void operator()(false_type) const { + _LIBCPP_HIDDEN void operator()(true_type) const { __this->__emplace<_Ip>(std::forward<_Arg>(__arg)); } + _LIBCPP_HIDDEN void operator()(false_type) const { __this->__emplace<_Ip>(_Tp(std::forward<_Arg>(__arg))); } __assignment* __this; diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp index cbdb2ab1758e30..f0ea6a8f2c7783 100644 --- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp +++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.compile.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp index e88c176af4a8ba..a5ce5d16581306 100644 --- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard_extensions.verify.cpp @@ -11,7 +11,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: 
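The sstream hunks above drop the Clang-16 workaround macro _LIBCPP_HIDE_FROM_ABI_SSTREAM in favor of plain _LIBCPP_HIDE_FROM_ABI on the C++20 str() overloads. The rvalue overload being re-annotated is the one that lets callers steal the buffer instead of copying it; usage sketch:

#include <cassert>
#include <sstream>
#include <string>
#include <utility>

int main() {
  std::ostringstream os;
  os << "value=" << 42;
  // C++20: str() on an rvalue stream moves the underlying string out.
  std::string s = std::move(os).str();
  assert(s == "value=42");
  return 0;
}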
availability-tzdb-missing // diff --git a/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp b/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp index 7c98ff1c1d566c..3d50d2347d6bb8 100644 --- a/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp +++ b/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp @@ -24,7 +24,7 @@ # error "-fexperimental-library should enable the stop_token" #endif -#ifdef _LIBCPP_HAS_NO_INCOMPLETE_TZDB +#ifdef _LIBCPP_HAS_NO_EXPERIMENTAL_TZDB # error "-fexperimental-library should enable the chrono TZDB" #endif diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp index 282bddcf9adb14..25a0f00003da2f 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/leap_seconds.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/links.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/links.pass.cpp index 92d761d46bccef..9bace25629f727 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/links.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/links.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp index fcfc34625fbece..73f4dbd59af9ae 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/rules.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.list/erase_after.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.list/erase_after.pass.cpp index 9b6e2776fe139b..92842800f6bbd5 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.list/erase_after.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.list/erase_after.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp index 94c403dbe3967a..5da4c7eea11b4b 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb 
+// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index 971f7f04c49a8a..3ee213358f3524 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp index 0f0095a71b99b2..b4f32a1b6fd785 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/version.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/zones.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/zones.pass.cpp index e97b36fca2bb62..6d436d61357b39 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.db/zones.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/zones.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp index d18076a642ec6a..b3fbaaf30aaec9 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp @@ -12,7 +12,7 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp index faa7d855c8de7d..6b41c7bdf2344f 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp @@ -12,7 +12,7 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp index 194f58215b925f..7f08c64d5e0e71 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp +++ 
b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.rule_selection.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.rule_selection.pass.cpp index accd5bcdc89e26..33c5d0499bca80 100644 --- a/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.rule_selection.pass.cpp +++ b/libcxx/test/libcxx/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.rule_selection.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py index 28f223c422a9b0..e4e1d3f232c12c 100644 --- a/libcxx/test/libcxx/transitive_includes.gen.py +++ b/libcxx/test/libcxx/transitive_includes.gen.py @@ -64,7 +64,7 @@ {lit_header_restrictions.get(header, '')} // TODO: Fix this test to make it work with localization or wide characters disabled -// UNSUPPORTED{BLOCKLIT}: no-localization, no-wide-characters, no-threads, no-filesystem, libcpp-has-no-incomplete-tzdb, no-tzdb +// UNSUPPORTED{BLOCKLIT}: no-localization, no-wide-characters, no-threads, no-filesystem, libcpp-has-no-experimental-tzdb, no-tzdb // When built with modules, this test doesn't work because --trace-includes doesn't // report the stack of includes correctly. diff --git a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/alg.pass.cpp b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/alg.pass.cpp index 8a9cae0e610c35..159cb19f65468b 100644 --- a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/alg.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/alg.pass.cpp @@ -38,12 +38,12 @@ int main(int, char**) // m might overflow. The overflow is not OK and result will be in bounds // so we should use Schrage's algorithm - typedef std::linear_congruential_engine E2; + typedef std::linear_congruential_engine E2; E2 e2; // make sure Schrage's algorithm is used (it would be 0s after the first otherwise) assert(e2() == (1ull << 32)); assert(e2() == (1ull << 63) - 1ull); - assert(e2() == (1ull << 63) - (1ull << 33) + 1ull); + assert(e2() == (1ull << 63) - 0x1ffffffffull); // make sure result is in bounds assert(e2() < (1ull << 63) + 1); assert(e2() < (1ull << 63) + 1); @@ -56,9 +56,9 @@ int main(int, char**) typedef std::linear_congruential_engine E3; E3 e3; // make sure Schrage's algorithm is used - assert(e3() == 402727752ull); - assert(e3() == 162159612030764687ull); - assert(e3() == 108176466184989142ull); + assert(e3() == 0x18012348ull); + assert(e3() == 0x2401b4ed802468full); + assert(e3() == 0x18051ec400369d6ull); // make sure result is in bounds assert(e3() < (3ull << 56)); assert(e3() < (3ull << 56)); @@ -66,19 +66,52 @@ int main(int, char**) assert(e3() < (3ull << 56)); assert(e3() < (3ull << 56)); - // m will not overflow so we should not use Schrage's algorithm - typedef std::linear_congruential_engine E4; + // 32-bit case: + // m might overflow. 
The overflow is not OK, result will be in bounds, + // and Schrage's algorithm is incompatible here. Need to use 64 bit arithmetic. + typedef std::linear_congruential_engine E4; E4 e4; + // make sure enough precision is used + assert(e4() == 0x10009u); + assert(e4() == 0x120053u); + assert(e4() == 0xf5030fu); + // make sure result is in bounds + assert(e4() < 0x7fffffffu); + assert(e4() < 0x7fffffffu); + assert(e4() < 0x7fffffffu); + assert(e4() < 0x7fffffffu); + assert(e4() < 0x7fffffffu); + +#ifndef _LIBCPP_HAS_NO_INT128 + // m might overflow. The overflow is not OK, result will be in bounds, + // and Schrage's algorithm is incompatible here. Need to use 128 bit arithmetic. + typedef std::linear_congruential_engine E5; + E5 e5; + // make sure enough precision is used + assert(e5() == 0x100000001ull); + assert(e5() == 0x200000009ull); + assert(e5() == 0xb00000019ull); + // make sure result is in bounds + assert(e5() < (1ull << 61) - 1ull); + assert(e5() < (1ull << 61) - 1ull); + assert(e5() < (1ull << 61) - 1ull); + assert(e5() < (1ull << 61) - 1ull); + assert(e5() < (1ull << 61) - 1ull); +#endif + + // m will not overflow so we should not use Schrage's algorithm + typedef std::linear_congruential_engine E6; + E6 e6; // make sure the correct algorithm was used - assert(e4() == 2ull); - assert(e4() == 3ull); - assert(e4() == 4ull); + assert(e6() == 2ull); + assert(e6() == 3ull); + assert(e6() == 4ull); // make sure result is in bounds - assert(e4() < (1ull << 48)); - assert(e4() < (1ull << 48)); - assert(e4() < (1ull << 48)); - assert(e4() < (1ull << 48)); - assert(e4() < (1ull << 48)); + assert(e6() < (1ull << 48)); + assert(e6() < (1ull << 48)); + assert(e6() < (1ull << 48)); + assert(e6() < (1ull << 48)); + assert(e6() < (1ull << 48)); return 0; -} \ No newline at end of file +} diff --git a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/assign.pass.cpp b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/assign.pass.cpp index 5317f171a98a79..73829071bd9580 100644 --- a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/assign.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/assign.pass.cpp @@ -61,24 +61,34 @@ test() test1(); test1(); test1(); +} + +template +void test_ext() { + const T M(static_cast(-1)); - /* - // Cases where m is odd and m % a > m / a (not implemented) + // Cases where m is odd and m % a > m / a test1(); test1(); test1(); test1(); test1(); test1(); - */ } int main(int, char**) { test(); + test_ext(); test(); + test_ext(); test(); + test_ext(); test(); + // This isn't implemented on platforms without __int128 +#ifndef _LIBCPP_HAS_NO_INT128 + test_ext(); +#endif - return 0; + return 0; } diff --git a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/copy.pass.cpp b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/copy.pass.cpp index 8e950043d594f9..8387a1763714f0 100644 --- a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/copy.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/copy.pass.cpp @@ -60,24 +60,34 @@ test() test1(); test1(); test1(); +} + +template +void test_ext() { + const T M(static_cast(-1)); - /* - // Cases where m is odd and m % a > m / a (not implemented) + // Cases where m is odd and m % a > m / a test1(); test1(); test1(); test1(); test1(); test1(); - */ } int main(int, char**) { test(); + test_ext(); test(); + test_ext(); test(); + test_ext(); test(); + // This isn't implemented on platforms without __int128 +#ifndef _LIBCPP_HAS_NO_INT128 + test_ext(); +#endif - return 0; + return 0; 
} diff --git a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/default.pass.cpp b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/default.pass.cpp index 52126f7a200dbe..c59afd7a3eb273 100644 --- a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/default.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/default.pass.cpp @@ -58,24 +58,34 @@ test() test1(); test1(); test1(); +} + +template +void test_ext() { + const T M(static_cast(-1)); - /* - // Cases where m is odd and m % a > m / a (not implemented) + // Cases where m is odd and m % a > m / a test1(); test1(); test1(); test1(); test1(); test1(); - */ } int main(int, char**) { test(); + test_ext(); test(); + test_ext(); test(); + test_ext(); test(); + // This isn't implemented on platforms without __int128 +#ifndef _LIBCPP_HAS_NO_INT128 + test_ext(); +#endif - return 0; + return 0; } diff --git a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/values.pass.cpp b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/values.pass.cpp index 28d8dfea01fab3..98b07e70f247af 100644 --- a/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/values.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.eng/rand.eng.lcong/values.pass.cpp @@ -91,24 +91,34 @@ test() test1(); test1(); test1(); +} - /* - // Cases where m is odd and m % a > m / a (not implemented) +template +void test_ext() { + const T M(static_cast(-1)); + + // Cases where m is odd and m % a > m / a test1(); test1(); test1(); test1(); test1(); test1(); - */ } int main(int, char**) { test(); + test_ext(); test(); + test_ext(); test(); + test_ext(); test(); + // This isn't implemented on platforms without __int128 +#ifndef _LIBCPP_HAS_NO_INT128 + test_ext(); +#endif - return 0; + return 0; } diff --git a/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp index ef36416c3fb232..019a1fc4732758 100644 --- a/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp @@ -12,7 +12,7 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // REQUIRES: locale.fr_FR.UTF-8 diff --git a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp index 0a41424cfdcb83..d57977413b696c 100644 --- a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp @@ -11,7 +11,7 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 diff --git a/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp index 4fcdf6fab9977c..f873ad3167819e 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/leap_seconds.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git 
a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp index d85c8ba52622a0..2c43e121613c77 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp index 470a722d0b69c6..6780bbff368254 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp index a5579a3820b6ac..4eadbf495f8122 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp index c3142a86bf9d65..4d600fcdf40e3f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/erase_after.compile.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/erase_after.compile.pass.cpp index b6844786b489a2..db276147773b5c 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/erase_after.compile.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/erase_after.compile.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp index 12c5310772f6a9..01167365d9507a 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp 
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp index b00b8b44188d0b..6759a01554500b 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/types.compile.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/types.compile.pass.cpp index 2c2e3a8274a47a..774037fa263c02 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/types.compile.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/types.compile.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp index af38772ee3cb28..5e1dd478d34b6f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp index 36b68cefc8d31a..554f259eb5c2af 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp index 7b4218cc8421b4..e6497e26323ce6 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing 
// diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp index 12987f6c89d808..f929dafcc96838 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/tzdb.members.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/tzdb.members.pass.cpp index af95274e33a811..5d31cbfb8b22f1 100644 --- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/tzdb.members.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/tzdb.members.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/local_info.members.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/local_info.members.pass.cpp index 4e2a757a0a970f..f246d5954d637f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/local_info.members.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/local_info.members.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // diff --git a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp index d9bf066b068783..e144add4642727 100644 --- a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.local/ostream.pass.cpp @@ -12,7 +12,7 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // diff --git a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp index 82c4844b423c5a..18fab6ee56e5ef 100644 --- a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp @@ -12,7 +12,7 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // diff --git a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/sys_info.members.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/sys_info.members.pass.cpp index 2510792c2280b1..bfbe9b90ada64c 100644 --- a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/sys_info.members.pass.cpp +++ 
b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/sys_info.members.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // diff --git a/libcxx/test/std/time/time.zone/time.zone.leap/assign.copy.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.leap/assign.copy.pass.cpp index 6918ed6be5c144..7ea692dde8a259 100644 --- a/libcxx/test/std/time/time.zone/time.zone.leap/assign.copy.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.leap/assign.copy.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.leap/cons.copy.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.leap/cons.copy.pass.cpp index 3dad08968d12a2..79ed0860811588 100644 --- a/libcxx/test/std/time/time.zone/time.zone.leap/cons.copy.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.leap/cons.copy.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.leap/members/date.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.leap/members/date.pass.cpp index 6f9fe1c47d3513..bd148689f96f07 100644 --- a/libcxx/test/std/time/time.zone/time.zone.leap/members/date.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.leap/members/date.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.leap/members/value.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.leap/members/value.pass.cpp index 652e51ef0bf105..c6181ee7df9f58 100644 --- a/libcxx/test/std/time/time.zone/time.zone.leap/members/value.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.leap/members/value.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.leap/nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.leap/nonmembers/comparison.pass.cpp index bf6855ea63dfc1..448cd88d146f9f 100644 --- a/libcxx/test/std/time/time.zone/time.zone.leap/nonmembers/comparison.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.leap/nonmembers/comparison.pass.cpp @@ -12,7 +12,7 @@ // TODO TZDB test whether this can be enabled with gcc 14. 
// UNSUPPORTED: gcc-13 -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp index c2412bac461bc5..593675cfba4598 100644 --- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp index 2f8b5b9421d63c..aa97fdadcebcc1 100644 --- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp index 944818c1ad0c16..22c270f24d1897 100644 --- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.link/types.compile.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/types.compile.pass.cpp index 2e4f178c7c0504..268e78c9dcad3d 100644 --- a/libcxx/test/std/time/time.zone/time.zone.link/types.compile.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.link/types.compile.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp index d27cf0bd89062e..25d2ff11d09341 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23, c++26 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp 
b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp index d1ff2fe683108c..b9f47382927977 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp index 05328e2256c79c..50b7bdb568d437 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/sys_info.zdump.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb, has-no-zdump -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // TODO TZDB Investigate diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp index 7c680707bc518b..e362b183fca026 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/types.compile.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/types.compile.pass.cpp index 915d5596bf22b0..926ecef367016e 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/types.compile.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/types.compile.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb -// XFAIL: libcpp-has-no-incomplete-tzdb +// XFAIL: libcpp-has-no-experimental-tzdb // XFAIL: availability-tzdb-missing // diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp index fd3416f694ba7c..94a1ea576bda0f 100644 --- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp @@ -176,12 +176,12 @@ void test_P1361() { assert_is_formattable, CharT>(); -# if !defined(TEST_HAS_NO_INCOMPLETE_TZDB) +# if !defined(TEST_HAS_NO_EXPERIMENTAL_TZDB) assert_is_formattable(); assert_is_formattable(); //assert_is_formattable(); -# endif // !defined(TEST_HAS_NO_INCOMPLETE_TZDB) +# endif // !defined(TEST_HAS_NO_EXPERIMENTAL_TZDB) #endif // TEST_HAS_NO_LOCALIZATION } diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index fe9207d7e59690..68dd591cb57558 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -417,8 
+417,8 @@ inline Tp const& DoNotOptimize(Tp const& value) {
 #  define TEST_HAS_NO_RANDOM_DEVICE
 #endif
 
-#if defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB)
-#  define TEST_HAS_NO_INCOMPLETE_TZDB
+#if defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+#  define TEST_HAS_NO_EXPERIMENTAL_TZDB
 #endif
 
 #if defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE)
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 5e42562ed5db52..c2d294e49f4884 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -331,7 +331,7 @@ def getSuitableClangTidy(cfg):
         else [
             AddFeature("libcpp-has-no-incomplete-pstl"),
             AddFeature("libcpp-has-no-experimental-stop_token"),
-            AddFeature("libcpp-has-no-incomplete-tzdb"),
+            AddFeature("libcpp-has-no-experimental-tzdb"),
             AddFeature("libcpp-has-no-experimental-syncstream"),
         ],
     ),
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 5fffdc51f34ddf..a5b47f020f8726 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1856,8 +1856,9 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
   std::vector> stack;
 
   // Iterate over argv to process input files and positional arguments.
+  std::optional<MemoryBufferRef> defaultScript;
   InputFile::isInGroup = false;
-  bool hasInput = false;
+  bool hasInput = false, hasScript = false;
   for (auto *arg : args) {
     switch (arg->getOption().getID()) {
     case OPT_library:
@@ -1879,9 +1880,16 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
       break;
     }
     case OPT_script:
+    case OPT_default_script:
       if (std::optional<std::string> path = searchScript(arg->getValue())) {
-        if (std::optional<MemoryBufferRef> mb = readFile(*path))
-          readLinkerScript(*mb);
+        if (std::optional<MemoryBufferRef> mb = readFile(*path)) {
+          if (arg->getOption().matches(OPT_default_script)) {
+            defaultScript = mb;
+          } else {
+            readLinkerScript(*mb);
+            hasScript = true;
+          }
+        }
         break;
       }
       error(Twine("cannot find linker script ") + arg->getValue());
@@ -1961,6 +1969,8 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
     }
   }
 
+  if (defaultScript && !hasScript)
+    readLinkerScript(*defaultScript);
   if (files.empty() && !hasInput && errorCount() == 0)
     error("no input files");
 }
diff --git a/lld/ELF/DriverUtils.cpp b/lld/ELF/DriverUtils.cpp
index de8ffab0e21249..ac74604408152d 100644
--- a/lld/ELF/DriverUtils.cpp
+++ b/lld/ELF/DriverUtils.cpp
@@ -186,6 +186,7 @@ std::string elf::createResponseFile(const opt::InputArgList &args) {
       os << arg->getSpelling() << quote(rewritePath(arg->getValue())) << "\n";
       break;
     case OPT_call_graph_ordering_file:
+    case OPT_default_script:
     case OPT_dynamic_list:
     case OPT_export_dynamic_symbol_list:
     case OPT_just_symbols:
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index d470646ed0eeef..72eaf157a181cf 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -157,6 +157,8 @@ defm debug_names: BB<"debug-names",
     "Generate a merged .debug_names section",
     "Do not generate a merged .debug_names section (default)">;
 
+defm default_script: EEq<"default-script", "In the absence of --script, read this default linker script">;
+
 defm demangle: B<"demangle",
     "Demangle symbol names (default)",
     "Do not demangle symbol names">;
 
@@ -555,6 +557,7 @@ HelpText<"Format diagnostics for Visual Studio compatibility">;
 def package_metadata: JJ<"package-metadata=">, HelpText<"Emit package metadata note">;
 
 // Aliases
+def: Separate<["-"], "dT">, Alias<default_script>, HelpText<"Alias for --default-script">;
 def: Separate<["-"], "f">, Alias<auxiliary>, HelpText<"Alias for --auxiliary">;
 def: F<"call_shared">, Alias<Bdynamic>, HelpText<"Alias for --Bdynamic">;
 def: F<"dy">, Alias<Bdynamic>, HelpText<"Alias for 
--Bdynamic">; diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index ba8ce8f784759a..3861120915e8bc 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -176,6 +176,10 @@ is specified, print to the map file. Generate a merged .Li .debug_names section. +.It Fl -default-script Ns = Ns Ar file , Fl dT Ar file +In the absence of +.Fl -script , +read this default linker script. .It Fl -defsym Ns = Ns Ar symbol Ns = Ns Ar expression Define a symbol alias. .Ar expression diff --git a/lld/test/ELF/linkerscript/default-script.s b/lld/test/ELF/linkerscript/default-script.s new file mode 100644 index 00000000000000..bb716a5fe0cddf --- /dev/null +++ b/lld/test/ELF/linkerscript/default-script.s @@ -0,0 +1,63 @@ +# REQUIRES: x86 +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o +# RUN: ld.lld --default-script=def.t b.t -T a.t a.o -o out +# RUN: llvm-readelf -Ss out | FileCheck %s + +# CHECK: Name +# CHECK: .foo2 +# CHECK-NEXT: .foo0 +# CHECK-NEXT: .foo1 +# CHECK: 1: 000000000000000c 0 NOTYPE GLOBAL DEFAULT 4 _start +# CHECK-NEXT: 2: 000000000000002a 0 NOTYPE GLOBAL DEFAULT ABS b +# CHECK-NEXT: 3: 000000000000002a 0 NOTYPE GLOBAL DEFAULT ABS a +# CHECK-EMPTY: + +## In the absence of --script options, the default linker script is read. +# RUN: ld.lld --default-script def.t b.t a.o -o out1 +# RUN: llvm-readelf -Ss out1 | FileCheck %s --check-prefix=CHECK1 +# RUN: ld.lld -dT def.t b.t a.o -o out1a && cmp out1 out1a +## If multiple -dT options are specified, the last -dT wins. +# RUN: ld.lld -dT a.t -dT def.t b.t a.o -o out1a && cmp out1 out1a + +# RUN: mkdir d && cp def.t d/default.t +# RUN: ld.lld -L d -dT default.t b.t a.o -o out1a && cmp out1 out1a + +# CHECK1: Name +# CHECK1: .foo2 +# CHECK1-NEXT: .foo1 +# CHECK1-NEXT: .foo0 +# CHECK1: 1: 000000000000000c 0 NOTYPE GLOBAL DEFAULT 4 _start +# CHECK1-NEXT: 2: 000000000000002a 0 NOTYPE GLOBAL DEFAULT ABS b +# CHECK1-NEXT: 3: 000000000000002a 0 NOTYPE GLOBAL DEFAULT ABS def +# CHECK1-EMPTY: + +# RUN: not ld.lld --default-script not-exist.t b.t -T a.t a.o 2>&1 | FileCheck %s --check-prefix=ERR +# ERR: error: cannot find linker script not-exist.t + +#--- a.s +.globl _start +_start: + +.section .foo0,"a"; .long 0 +.section .foo1,"a"; .long 0 +.section .foo2,"a"; .long 0 + +#--- a.t +a = 42; +SECTIONS { + .foo2 : {} + .foo0 : {} + .foo1 : {} +} + +#--- b.t +b = 42; + +#--- def.t +def = 42; +SECTIONS { + .foo2 : {} + .foo1 : {} + .foo0 : {} +} diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s index dd5d366ae23448..3cbba687169720 100644 --- a/lld/test/ELF/pack-dyn-relocs.s +++ b/lld/test/ELF/pack-dyn-relocs.s @@ -117,79 +117,22 @@ // ANDROID32-NEXT: } // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.a32.o %t.a32.so -o %t4.a32 -// RUN: llvm-readobj -S --dynamic-table %t4.a32 | FileCheck --check-prefix=RELR32-HEADERS %s -// RUN: llvm-readobj -r --raw-relr %t4.a32 | FileCheck --check-prefix=RAW-RELR32 %s -// RUN: llvm-readobj -r %t4.a32 | FileCheck --check-prefix=RELR32 %s - -// RELR32-HEADERS: Index: 1 -// RELR32-HEADERS-NEXT: Name: .dynsym - -// RELR32-HEADERS: Name: .relr.dyn -// RELR32-HEADERS-NEXT: Type: SHT_RELR -// RELR32-HEADERS-NEXT: Flags [ (0x2) -// RELR32-HEADERS-NEXT: SHF_ALLOC (0x2) -// RELR32-HEADERS-NEXT: ] -// RELR32-HEADERS-NEXT: Address: [[ADDR:.*]] -// RELR32-HEADERS-NEXT: Offset: [[ADDR]] -// RELR32-HEADERS-NEXT: Size: 8 -// RELR32-HEADERS-NEXT: Link: 0 -// RELR32-HEADERS-NEXT: Info: 0 -// RELR32-HEADERS-NEXT: AddressAlignment: 4 -// RELR32-HEADERS-NEXT: 
EntrySize: 4 - -// RELR32-HEADERS: 0x00000024 RELR [[ADDR]] -// RELR32-HEADERS: 0x00000023 RELRSZ 8 (bytes) -// RELR32-HEADERS: 0x00000025 RELRENT 4 (bytes) - -/// SHT_RELR section contains address/bitmap entries -/// encoding the offsets for relative relocation. -// RAW-RELR32: Section ({{.+}}) .relr.dyn { -// RAW-RELR32-NEXT: 0x30284 -// RAW-RELR32-NEXT: 0x7FCFEFF -// RAW-RELR32-NEXT: } +// RUN: llvm-readelf -Sdr %t4.a32 | FileCheck --check-prefix=RELR32 %s -/// Decoded SHT_RELR section is same as UNPACKED, -/// but contains only the relative relocations. -/// Any relative relocations with odd offset stay in SHT_REL. -// RELR32: Section ({{.+}}) .rel.dyn { -// RELR32-NEXT: 0x302F1 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302A4 R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x302C8 R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x302F5 R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x302F9 R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x302FD R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x30301 R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x30305 R_ARM_ABS32 bar2 -// RELR32-NEXT: 0x302C4 R_ARM_ABS32 zed2 -// RELR32-NEXT: } -// RELR32-NEXT: Section ({{.+}}) .relr.dyn { -// RELR32-NEXT: 0x30284 R_ARM_RELATIVE - -// RELR32-NEXT: 0x30288 R_ARM_RELATIVE - -// RELR32-NEXT: 0x3028C R_ARM_RELATIVE - -// RELR32-NEXT: 0x30290 R_ARM_RELATIVE - -// RELR32-NEXT: 0x30294 R_ARM_RELATIVE - -// RELR32-NEXT: 0x30298 R_ARM_RELATIVE - -// RELR32-NEXT: 0x3029C R_ARM_RELATIVE - -// RELR32-NEXT: 0x302A0 R_ARM_RELATIVE - - -// RELR32-NEXT: 0x302A8 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302AC R_ARM_RELATIVE - -// RELR32-NEXT: 0x302B0 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302B4 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302B8 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302BC R_ARM_RELATIVE - -// RELR32-NEXT: 0x302C0 R_ARM_RELATIVE - - -// RELR32-NEXT: 0x302CC R_ARM_RELATIVE - -// RELR32-NEXT: 0x302D0 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302D4 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302D8 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302DC R_ARM_RELATIVE - -// RELR32-NEXT: 0x302E0 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302E4 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302E8 R_ARM_RELATIVE - -// RELR32-NEXT: 0x302EC R_ARM_RELATIVE - -// RELR32-NEXT: } +// RELR32: Name Type Address Off Size ES Flg Lk Inf Al +// RELR32: .dynstr STRTAB {{.*}} 00 A 0 0 1 +// RELR32-NEXT: .rel.dyn REL {{.*}} 08 A 1 0 4 +// RELR32-NEXT: .relr.dyn RELR {{0*}}[[#%x,RELR:]] {{.*}} 04 A 0 0 4 + +// RELR32: (RELCOUNT) 1 +// RELR32: (RELR) 0x[[#RELR]] +// RELR32-NEXT: (RELRSZ) 8 (bytes) +// RELR32-NEXT: (RELRENT) 4 (bytes) + +// RELR32: Relocation section '.relr.dyn' at offset {{.*}} contains 24 entries: +// RELR32-NEXT: Index: Entry Address Symbolic Address +// RELR32-NEXT: 0000: 00030284 {{.*}} +// RELR32-NEXT: 0001: 07fcfeff {{.*}} // RUN: llvm-mc -filetype=obj -triple=aarch64-unknown-linux %p/Inputs/shared2.s -o %t.a64.so.o // RUN: ld.lld -shared %t.a64.so.o -soname=so -o %t.a64.so @@ -292,7 +235,6 @@ // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.a64.o %t.a64.so -o %t4.a64 // RUN: llvm-readelf -Sdr -x .data %t4.a64 | FileCheck --check-prefix=RELR64 %s -// RUN: llvm-readobj -r --raw-relr %t4.a64 | FileCheck --check-prefix=RAW-RELR64 %s // RELR64: Name Type Address Off Size ES Flg Lk Inf Al // RELR64: .dynstr STRTAB {{.*}} 00 A 0 0 1 @@ -305,13 +247,6 @@ // RELR64-NEXT: (RELRSZ) 16 (bytes) // RELR64-NEXT: (RELRENT) 8 (bytes) -/// SHT_RELR section contains address/bitmap entries -/// encoding the offsets for relative relocation. 
-// RAW-RELR64: Section ({{.+}}) .relr.dyn { -// RAW-RELR64-NEXT: 0x30490 -// RAW-RELR64-NEXT: 0x7FCFEFF -// RAW-RELR64-NEXT: } - /// Decoded SHT_RELR section is same as UNPACKED, /// but contains only the relative relocations. /// Any relative relocations with odd offset stay in SHT_RELA. @@ -328,31 +263,31 @@ // RELR64-NEXT: 0000000000030510 0000000200000101 R_AARCH64_ABS64 0000000000000000 zed2 + 0 // RELR64-EMPTY: // RELR64-NEXT: Relocation section '.relr.dyn' at offset {{.*}} contains 24 entries: -// RELR64-NEXT: Offset Info Type Symbol's Value Symbol's Name -// RELR64-NEXT: 0000000000030490 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030498 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304a0 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304a8 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304b0 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304b8 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304c0 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304c8 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304d8 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304e0 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304e8 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304f0 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 00000000000304f8 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030500 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030508 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030520 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030528 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030530 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030538 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030540 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030548 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030550 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030558 0000000000000403 R_AARCH64_RELATIVE -// RELR64-NEXT: 0000000000030560 0000000000000403 R_AARCH64_RELATIVE +// RELR64-NEXT: Symbolic Address +// RELR64-NEXT: $d.0{{$}} +// RELR64-NEXT: $d.0 + 0x8 +// RELR64-NEXT: $d.0 + 0x10 +// RELR64-NEXT: $d.0 + 0x18 +// RELR64-NEXT: $d.0 + 0x20 +// RELR64-NEXT: $d.0 + 0x28 +// RELR64-NEXT: $d.0 + 0x30 +// RELR64-NEXT: $d.0 + 0x38 +// RELR64-NEXT: $d.0 + 0x48 +// RELR64-NEXT: $d.0 + 0x50 +// RELR64-NEXT: $d.0 + 0x58 +// RELR64-NEXT: $d.0 + 0x60 +// RELR64-NEXT: $d.0 + 0x68 +// RELR64-NEXT: $d.0 + 0x70 +// RELR64-NEXT: $d.0 + 0x78 +// RELR64-NEXT: $d.0 + 0x90 +// RELR64-NEXT: $d.0 + 0x98 +// RELR64-NEXT: $d.0 + 0xa0 +// RELR64-NEXT: $d.0 + 0xa8 +// RELR64-NEXT: $d.0 + 0xb0 +// RELR64-NEXT: $d.0 + 0xb8 +// RELR64-NEXT: $d.0 + 0xc0 +// RELR64-NEXT: $d.0 + 0xc8 +// RELR64-NEXT: $d.0 + 0xd0 // RELR64-EMPTY: // RELR64-NEXT: Hex dump of section '.data': // RELR64-NEXT: 0x00030490 90040300 00000000 91040300 00000000 . 
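Both the removed RAW-RELR32 and RAW-RELR64 checks matched the same shape of packed section: one even address entry followed by one odd bitmap entry (0x30490 and 0x7fcfeff in the 64-bit test) expanding to the 24 relocations that llvm-readelf now prints symbolically. For illustration only, a freestanding C++ sketch of the standard RELR decoding loop, not code from this patch, with an ELF64 word size assumed:

#include <cstdint>
#include <cstdio>
#include <vector>

// Decodes a 64-bit SHT_RELR section. Even entries name an address that takes
// a relative relocation; odd entries are bitmaps whose bits 1..63 mark which
// of the 63 word-sized slots after the previous address also take one.
static void decodeRelr(const std::vector<uint64_t> &relr) {
  const uint64_t wordsize = 8; // assumed ELF64; a 32-bit RELR section uses 4
  uint64_t base = 0;
  for (uint64_t entry : relr) {
    if ((entry & 1) == 0) {
      std::printf("reloc at 0x%llx\n", (unsigned long long)entry);
      base = entry + wordsize; // following bitmap entries start at next slot
    } else {
      uint64_t where = base;
      for (entry >>= 1; entry != 0; entry >>= 1, where += wordsize)
        if (entry & 1)
          std::printf("reloc at 0x%llx\n", (unsigned long long)where);
      base += 63 * wordsize; // each bitmap entry covers 63 slots
    }
  }
}

int main() {
  // The two raw words the deleted RAW-RELR64 check matched: this prints the
  // 24 addresses the new symbolic output lists ($d.0, $d.0 + 0x8, ...).
  decodeRelr({0x30490, 0x7fcfeff});
}

The bitmap 0x7fcfeff has 23 payload bits set, which together with the one direct address entry accounts for all 24 entries reported by llvm-readelf.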
diff --git a/lld/test/ELF/partition-pack-dyn-relocs.s b/lld/test/ELF/partition-pack-dyn-relocs.s index 3727bcc21614d8..8127110e348c55 100644 --- a/lld/test/ELF/partition-pack-dyn-relocs.s +++ b/lld/test/ELF/partition-pack-dyn-relocs.s @@ -29,9 +29,9 @@ // CHECK-EMPTY: // CHECK: Relocation section '.relr.dyn' -// CHECK-NEXT: Offset -// PART0-NEXT: 000000000000[[DATA_SEGMENT]]378 {{.*}} R_X86_64_RELATIVE -// PART1-NEXT: 000000000000[[DATA_SEGMENT]]340 {{.*}} R_X86_64_RELATIVE +// CHECK-NEXT: Address Symbolic Address +// PART0-NEXT: 000000000000[[DATA_SEGMENT]]378 p0{{$}} +// PART1-NEXT: 000000000000[[DATA_SEGMENT]]340 p1{{$}} // CHECK-EMPTY: .section .llvm_sympart,"",@llvm_sympart diff --git a/lld/test/ELF/reproduce.s b/lld/test/ELF/reproduce.s index 314c08b4f7cdd5..8818a9e35f4039 100644 --- a/lld/test/ELF/reproduce.s +++ b/lld/test/ELF/reproduce.s @@ -34,10 +34,11 @@ # RUN: cp dyn dyn2 # RUN: echo > file # RUN: echo > file2 +# RUN: echo > file3 # RUN: echo "_start" > order # RUN: mkdir "sysroot with spaces" # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o 'foo bar' -# RUN: ld.lld --reproduce repro3.tar 'foo bar' -L"foo bar" -Lfile -Tfile2 \ +# RUN: ld.lld --reproduce repro3.tar 'foo bar' -L"foo bar" -Lfile -Tfile2 -dT file3 \ # RUN: --dynamic-list dyn --export-dynamic-symbol-list dyn2 -rpath file --script=file --symbol-ordering-file order \ # RUN: --sysroot "sysroot with spaces" --sysroot="sysroot with spaces" \ # RUN: --version-script ver --dynamic-linker "some unusual/path" -soname 'foo bar' \ @@ -48,6 +49,7 @@ # RSP3-NEXT: -L "[[BASEDIR:.+]]/foo bar" # RSP3-NEXT: -L [[BASEDIR]]/file # RSP3-NEXT: --script [[BASEDIR]]/file2 +# RSP3-NEXT: --default-script [[BASEDIR]]/file3 # RSP3-NEXT: --dynamic-list [[BASEDIR]]/dyn # RSP3-NEXT: --export-dynamic-symbol-list [[BASEDIR]]/dyn2 # RSP3-NEXT: -rpath [[BASEDIR]]/file diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index c1dce4ccbf79c2..b71c531f21633a 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -447,6 +447,8 @@ class CompilerType { bool omit_empty_base_classes, std::vector &child_indexes) const; + CompilerType GetDirectNestedTypeWithName(llvm::StringRef name) const; + /// Return the number of template arguments the type has. /// If expand_pack is true, then variadic argument packs are automatically /// expanded to their supplied arguments. 
If it is false an argument pack diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index f647fcbf1636ea..3a927d313b823d 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -28,6 +28,7 @@ #include "lldb/Symbol/CompilerDeclContext.h" #include "lldb/Symbol/Type.h" #include "lldb/lldb-private.h" +#include "lldb/lldb-types.h" class PDBASTParser; @@ -363,6 +364,12 @@ class TypeSystem : public PluginInterface, lldb::opaque_compiler_type_t type, llvm::StringRef name, bool omit_empty_base_classes, std::vector &child_indexes) = 0; + virtual CompilerType + GetDirectNestedTypeWithName(lldb::opaque_compiler_type_t type, + llvm::StringRef name) { + return CompilerType(); + } + virtual bool IsTemplateType(lldb::opaque_compiler_type_t type); virtual size_t GetNumTemplateArguments(lldb::opaque_compiler_type_t type, diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index cb9bee8733e15d..07df8c52a2a4f0 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -431,7 +431,9 @@ void IRExecutionUnit::GetRunnableInfo(Status &error, lldb::addr_t &func_addr, } m_failed_lookups.clear(); - + ss.PutCString( + "\nHint: The expression tried to call a function that is not present " + "in the target, perhaps because it was optimized out by the compiler."); error.SetErrorString(ss.GetString()); return; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index be0ddb06f82c18..2621f682011b41 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -6997,6 +6997,36 @@ TypeSystemClang::GetIndexOfChildWithName(lldb::opaque_compiler_type_t type, return UINT32_MAX; } +CompilerType +TypeSystemClang::GetDirectNestedTypeWithName(lldb::opaque_compiler_type_t type, + llvm::StringRef name) { + if (!type || name.empty()) + return CompilerType(); + + clang::QualType qual_type = RemoveWrappingTypes(GetCanonicalQualType(type)); + const clang::Type::TypeClass type_class = qual_type->getTypeClass(); + + switch (type_class) { + case clang::Type::Record: { + if (!GetCompleteType(type)) + return CompilerType(); + const clang::RecordType *record_type = + llvm::cast(qual_type.getTypePtr()); + const clang::RecordDecl *record_decl = record_type->getDecl(); + + clang::DeclarationName decl_name(&getASTContext().Idents.get(name)); + for (NamedDecl *decl : record_decl->lookup(decl_name)) { + if (auto *tag_decl = dyn_cast(decl)) + return GetType(getASTContext().getTagDeclType(tag_decl)); + } + break; + } + default: + break; + } + return CompilerType(); +} + bool TypeSystemClang::IsTemplateType(lldb::opaque_compiler_type_t type) { if (!type) return false; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 05c303baa41640..68b82e9688f12b 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -897,6 +897,9 @@ class TypeSystemClang : public TypeSystem { bool omit_empty_base_classes, std::vector &child_indexes) override; + CompilerType GetDirectNestedTypeWithName(lldb::opaque_compiler_type_t type, + llvm::StringRef name) override; + bool IsTemplateType(lldb::opaque_compiler_type_t type) override; size_t GetNumTemplateArguments(lldb::opaque_compiler_type_t type, diff 
--git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index 8e4c3c761f784e..96e74b890d2d90 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -931,6 +931,15 @@ size_t CompilerType::GetIndexOfChildMemberWithName( return 0; } +CompilerType +CompilerType::GetDirectNestedTypeWithName(llvm::StringRef name) const { + if (IsValid() && !name.empty()) { + if (auto type_system_sp = GetTypeSystem()) + return type_system_sp->GetDirectNestedTypeWithName(m_type, name); + } + return CompilerType(); +} + size_t CompilerType::GetNumTemplateArguments(bool expand_pack) const { if (IsValid()) { if (auto type_system_sp = GetTypeSystem()) diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index 44a24d7178f562..dcfd1238f6d789 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -1176,21 +1176,8 @@ bool TypeImpl::GetDescription(lldb_private::Stream &strm, CompilerType TypeImpl::FindDirectNestedType(llvm::StringRef name) { if (name.empty()) return CompilerType(); - auto type_system = GetTypeSystem(/*prefer_dynamic*/ false); - auto *symbol_file = type_system->GetSymbolFile(); - if (!symbol_file) - return CompilerType(); - auto decl_context = type_system->GetCompilerDeclContextForType(m_static_type); - if (!decl_context.IsValid()) - return CompilerType(); - TypeQuery query(decl_context, ConstString(name), - TypeQueryOptions::e_find_one); - TypeResults results; - symbol_file->FindTypes(query, results); - TypeSP type_sp = results.GetFirstType(); - if (type_sp) - return type_sp->GetFullCompilerType(); - return CompilerType(); + return GetCompilerType(/*prefer_dynamic=*/false) + .GetDirectNestedTypeWithName(name); } bool TypeMemberFunctionImpl::IsValid() { diff --git a/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py b/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py index 6724bfc8ed78e0..baf06e4c59fbba 100644 --- a/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py +++ b/lldb/test/API/lang/cpp/constructors/TestCppConstructors.py @@ -47,7 +47,7 @@ def test_constructors(self): self.expect( "expr ClassWithDeletedDefaultCtor().value", error=True, - substrs=["Couldn't look up symbols:"], + substrs=["Couldn't look up symbols:", "function", "optimized out"], ) @skipIfWindows # Can't find operator new. 
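The Type.cpp change above reroutes TypeImpl::FindDirectNestedType away from a SymbolFile TypeQuery and into the TypeSystem itself, which is why the rewritten Python test that follows can resolve nested types on a type that only exists in an expression AST. A rough equivalent through the C++ SB API might look like this sketch (the type names come from that test; the helper itself is hypothetical):

#include "lldb/API/SBTarget.h"
#include "lldb/API/SBType.h"

// Hypothetical helper: resolve the nested type "Masks1" directly on a type.
// After this patch the nested lookup is answered by
// TypeSystemClang::GetDirectNestedTypeWithName rather than a debug-info
// query, so it works for module types and expression-AST types alike.
static lldb::SBType findMasks1(lldb::SBTarget &target) {
  lldb::SBType info = target.FindFirstType("PointerInfo");
  if (!info.IsValid())
    return lldb::SBType();
  return info.FindDirectNestedType("Masks1");
}
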
diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py index eba5e17355c3f8..09c1dee80ef6c4 100644 --- a/lldb/test/API/python_api/type/TestTypeList.py +++ b/lldb/test/API/python_api/type/TestTypeList.py @@ -18,6 +18,21 @@ def setUp(self): self.source = "main.cpp" self.line = line_number(self.source, "// Break at this line") + def _find_nested_type_in_Task_pointer(self, pointer_type): + self.assertTrue(pointer_type) + self.DebugSBType(pointer_type) + pointer_info_type = pointer_type.template_args[1] + self.assertTrue(pointer_info_type) + self.DebugSBType(pointer_info_type) + + pointer_masks1_type = pointer_info_type.FindDirectNestedType("Masks1") + self.assertTrue(pointer_masks1_type) + self.DebugSBType(pointer_masks1_type) + + pointer_masks2_type = pointer_info_type.FindDirectNestedType("Masks2") + self.assertTrue(pointer_masks2_type) + self.DebugSBType(pointer_masks2_type) + @skipIf(compiler="clang", compiler_version=["<", "17.0"]) def test(self): """Exercise SBType and SBTypeList API.""" @@ -151,22 +166,12 @@ def test(self): invalid_type = task_type.FindDirectNestedType(None) self.assertFalse(invalid_type) - # Check that FindDirectNestedType works with types from AST - pointer = frame0.FindVariable("pointer") - pointer_type = pointer.GetType() - self.assertTrue(pointer_type) - self.DebugSBType(pointer_type) - pointer_info_type = pointer_type.template_args[1] - self.assertTrue(pointer_info_type) - self.DebugSBType(pointer_info_type) - - pointer_masks1_type = pointer_info_type.FindDirectNestedType("Masks1") - self.assertTrue(pointer_masks1_type) - self.DebugSBType(pointer_masks1_type) - - pointer_masks2_type = pointer_info_type.FindDirectNestedType("Masks2") - self.assertTrue(pointer_masks2_type) - self.DebugSBType(pointer_masks2_type) + # Check that FindDirectNestedType works with types from module and + # expression ASTs. + self._find_nested_type_in_Task_pointer(frame0.FindVariable("pointer").GetType()) + self._find_nested_type_in_Task_pointer( + frame0.EvaluateExpression("pointer").GetType() + ) # We'll now get the child member 'id' from 'task_head'. id = task_head.GetChildMemberWithName("id") diff --git a/lldb/unittests/Host/FileSystemTest.cpp b/lldb/unittests/Host/FileSystemTest.cpp index 3b5ee7c8bc2237..58887f6b2467e0 100644 --- a/lldb/unittests/Host/FileSystemTest.cpp +++ b/lldb/unittests/Host/FileSystemTest.cpp @@ -93,7 +93,7 @@ class DummyFileSystem : public vfs::FileSystem { std::map::iterator I; std::string Path; bool isInPath(StringRef S) { - if (Path.size() < S.size() && S.find(Path) == 0) { + if (Path.size() < S.size() && S.starts_with(Path)) { auto LastSep = S.find_last_of('/'); if (LastSep == Path.size() || LastSep == Path.size() - 1) return true; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 5b40e49714069f..4d5cd963e09267 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1376,7 +1376,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { return TargetTTI->getShuffleCost( IsUnary ? 
TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
-          AdjustMask, CostKind, 0, nullptr, {}, Shuffle);
+          AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
       }
 
       // Narrowing shuffle - perform shuffle at original wider width and
@@ -1385,7 +1385,7 @@
         InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
             IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
-            VecSrcTy, AdjustMask, CostKind, 0, nullptr, {}, Shuffle);
+            VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
 
         SmallVector<int> ExtractMask(Mask.size());
         std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index e1c41b3b55ccfb..bab7c8868532db 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -690,11 +690,11 @@ inline Value *getArgumentAliasingToReturnedPointer(CallBase *Call,
 bool isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
     const CallBase *Call, bool MustPreserveNullness);
 
-/// This method strips off any GEP address adjustments and pointer casts from
-/// the specified value, returning the original object being addressed. Note
-/// that the returned value has pointer type if the specified value does. If
-/// the MaxLookup value is non-zero, it limits the number of instructions to
-/// be stripped off.
+/// This method strips off any GEP address adjustments, pointer casts
+/// or `llvm.threadlocal.address` from the specified value \p V, returning the
+/// original object being addressed. Note that the returned value has pointer
+/// type if the specified value does. If the \p MaxLookup value is non-zero, it
+/// limits the number of instructions to be stripped off.
 const Value *getUnderlyingObject(const Value *V, unsigned MaxLookup = 6);
 inline Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6) {
   // Force const to avoid infinite recursion.
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 4ffa6349871baf..d7ec3c16bec21c 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -2631,6 +2631,16 @@ class ShuffleVectorInst : public Instruction {
     return isInterleaveMask(Mask, Factor, NumInputElts, StartIndexes);
   }
 
+  /// Check if the mask is a DE-interleave mask of the given factor
+  /// \p Factor like:
+  ///     <Index, Index+Factor, ..., Index+(Len-1)*Factor>
+  static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
+                                         unsigned &Index);
+  static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
+    unsigned Unused;
+    return isDeInterleaveMaskOfFactor(Mask, Factor, Unused);
+  }
+
   /// Checks if the shuffle is a bit rotation of the first operand across
   /// multiple subelements, e.g:
   ///
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index bdd8465883fcff..11262f265100c8 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -320,6 +320,7 @@ def IIT_I2 : IIT_Int<2, 57>;
 def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT;
 def IIT_V6 : IIT_Vec<6, 60>;
+def IIT_V10 : IIT_Vec<10, 61>;
 }
 
 defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h
index 39165453de16b0..3265589b7c8dfa 100644
--- a/llvm/include/llvm/IR/MDBuilder.h
+++ b/llvm/include/llvm/IR/MDBuilder.h
@@ -61,6 +61,14 @@ class MDBuilder {
   /// Return metadata containing two branch weights.
MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight); + /// Return metadata containing two branch weights, with significant bias + /// towards `true` destination. + MDNode *createLikelyBranchWeights(); + + /// Return metadata containing two branch weights, with significant bias + /// towards `false` destination. + MDNode *createUnlikelyBranchWeights(); + /// Return metadata containing a number of branch weights. MDNode *createBranchWeights(ArrayRef<uint32_t> Weights); diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index faaec331c823ed..aa6cdf198485b0 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -514,16 +514,17 @@ class RecordWriterTrait { using hash_value_type = uint64_t; using offset_type = uint64_t; - // Pointer to the memprof schema to use for the generator. Unlike the reader - // we must use a default constructor with no params for the writer trait so we - // have a public member which must be initialized by the user. - MemProfSchema *Schema = nullptr; +private: + // Pointer to the memprof schema to use for the generator. + const MemProfSchema *Schema; // The MemProf version to use for the serialization. IndexedVersion Version; +public: // We do not support the default constructor, which does not set Version. RecordWriterTrait() = delete; - RecordWriterTrait(IndexedVersion V) : Version(V) {} + RecordWriterTrait(const MemProfSchema *Schema, IndexedVersion V) + : Schema(Schema), Version(V) {} static hash_value_type ComputeHash(key_type_ref K) { return K; } @@ -651,8 +652,8 @@ class CallStackWriterTrait { EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { using namespace support; endian::Writer LE(Out, llvm::endianness::little); + // We do not explicitly emit the key length because it is a constant. offset_type N = sizeof(K); - LE.write<offset_type>(N); offset_type M = sizeof(FrameId) * V.size(); LE.write<offset_type>(M); return std::make_pair(N, M); @@ -696,8 +697,8 @@ class CallStackLookupTrait { ReadKeyDataLength(const unsigned char *&D) { using namespace support; - offset_type KeyLen = - endian::readNext<offset_type, llvm::endianness::little>(D); + // We do not explicitly read the key length because it is a constant.
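// [Editor's note] The writer-side and reader-side traits of an
// OnDiskChainedHashTable must agree byte-for-byte on the layout, so these two
// hunks are two halves of one invariant: EmitKeyDataLength above stops
// writing the constant-size key length, and ReadKeyDataLength below stops
// consuming it, reconstructing it from the type instead:
//   writer:  offset_type N = sizeof(K);                      // not streamed
//   reader:  offset_type KeyLen = sizeof(external_key_type); // not read
// Only the variable-length data size still travels through the stream.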
+ offset_type KeyLen = sizeof(external_key_type); offset_type DataLen = endian::readNext<offset_type, llvm::endianness::little>(D); return std::make_pair(KeyLen, DataLen); diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h index 93090431cbb69f..ea1f4fc3b85dc8 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -76,6 +76,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { SimplifyQuery SQ; OptimizationRemarkEmitter &ORE; BlockFrequencyInfo *BFI; + BranchProbabilityInfo *BPI; ProfileSummaryInfo *PSI; DomConditionCache DC; @@ -96,13 +97,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { bool MinimizeSize, AAResults *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - const DataLayout &DL, LoopInfo *LI) + BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, + ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) : TTI(TTI), Builder(Builder), Worklist(Worklist), MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true, /*CanUseUndef*/ true, &DC), - ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} + ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), LI(LI) {} virtual ~InstCombiner() = default; diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp index 527f19b194eeb9..1ceb1b26294183 100644 --- a/llvm/lib/Analysis/GlobalsModRef.cpp +++ b/llvm/lib/Analysis/GlobalsModRef.cpp @@ -344,6 +344,14 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V, if (AnalyzeUsesOfPointer(I, Readers, Writers, OkayStoreDest)) return true; } else if (auto *Call = dyn_cast<CallBase>(I)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::threadlocal_address && + V == II->getArgOperand(0)) { + if (AnalyzeUsesOfPointer(II, Readers, Writers)) + return true; + continue; + } + } // Make sure that this is just the function being called, not that it is // passing into the function. if (Call->isDataOperand(&U)) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index b87bba7c20e0d6..c038f7eaabbb67 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6255,6 +6255,10 @@ bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( return true; case Intrinsic::ptrmask: return !MustPreserveNullness; + case Intrinsic::threadlocal_address: + // The underlying variable changes with thread ID. The Thread ID may change + // at coroutine suspend points. + return !Call->getParent()->getParent()->isPresplitCoroutine(); default: return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 438ac1c3cc6e2c..8989eabbe6df2e 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -200,28 +200,6 @@ FunctionPass *llvm::createInterleavedAccessPass() { return new InterleavedAccess(); } -/// Check if the mask is a DE-interleave mask of the given factor -/// \p Factor like: -///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor> -static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor, - unsigned &Index) { - // Check all potential start indices from 0 to (Factor - 1). - for (Index = 0; Index < Factor; Index++) { - unsigned i = 0; - - // Check that elements are in ascending order by Factor. Ignore undef - // elements.
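// [Editor's example] For Factor = 3 the loop below accepts exactly the masks
// <0, 3, 6, 9, ...> (Index 0), <1, 4, 7, 10, ...> (Index 1) and
// <2, 5, 8, 11, ...> (Index 2); undef (-1) elements are ignored. So the mask
// <1, 4, 7, 10> de-interleaves lane 1 of each group of three and leaves
// Index == 1.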
- for (; i < Mask.size(); i++) - if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor) - break; - - if (i == Mask.size()) - return true; - } - - return false; -} - /// Check if the mask is a DE-interleave mask for an interleaved load. /// /// E.g. DE-interleave masks (Factor = 2) could be: @@ -238,7 +216,7 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, // Make sure we don't produce a load wider than the input load. if (Mask.size() * Factor > NumLoadElements) return false; - if (isDeInterleaveMaskOfFactor(Mask, Factor, Index)) + if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index)) return true; } @@ -333,8 +311,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( for (auto *Shuffle : Shuffles) { if (Shuffle->getType() != VecTy) return false; - if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor, - Index)) + if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Shuffle->getShuffleMask(), Factor, Index)) return false; assert(Shuffle->getShuffleMask().size() <= NumLoadElements); @@ -343,8 +321,8 @@ for (auto *Shuffle : BinOpShuffles) { if (Shuffle->getType() != VecTy) return false; - if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor, - Index)) + if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor( + Shuffle->getShuffleMask(), Factor, Index)) return false; assert(Shuffle->getShuffleMask().size() <= NumLoadElements); diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 6eeed8b5c3f7de..dafa8e2527f6d9 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -312,8 +312,7 @@ llvm::AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg, LaneBitmask UseMask, DefMask; - for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { - const MachineOperand &MO = *O; + for (const MachineOperand &MO : const_mi_bundle_ops(MI)) { if (!MO.isReg() || MO.getReg() != Reg) continue; @@ -339,9 +338,7 @@ PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg, PhysRegInfo PRI = {false, false, false, false, false, false, false, false}; assert(Reg.isPhysical() && "analyzePhysReg not given a physical register!"); - for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { - const MachineOperand &MO = *O; - + for (const MachineOperand &MO : const_mi_bundle_ops(MI)) { if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) { PRI.Clobbered = true; continue; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0ea0c0df7f3726..3cdb801bae7bc0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2831,9 +2831,9 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { // (x - y) + -1 -> add (xor y, -1), x if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && - isAllOnesOrAllOnesSplat(N1)) { - SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1); - return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0)); + isAllOnesOrAllOnesSplat(N1, /*AllowUndefs=*/true)) { + SDValue Not = DAG.getNOT(DL, N0.getOperand(1), VT); + return DAG.getNode(ISD::ADD, DL, VT, Not, N0.getOperand(0)); } if (SDValue Combined = visitADDLikeCommutative(N0, N1, N)) @@ -3359,7 +3359,7 @@ SDValue DAGCombiner::visitUADDO_CARRY(SDNode *N) { } /** - * If we are facing some sort of diamond carry propapagtion pattern try to + * If we are facing some sort of diamond carry propagation pattern try to * break it up
to generate something like: * (uaddo_carry X, 0, (uaddo_carry A, B, Z):Carry) * @@ -3400,7 +3400,7 @@ static SDValue combineUADDO_CARRYDiamond(DAGCombiner &Combiner, Z = Carry0.getOperand(2); } else if (Carry0.getOpcode() == ISD::UADDO && isOneConstant(Carry0.getOperand(1))) { - EVT VT = Combiner.getSetCCResultType(Carry0.getValueType()); + EVT VT = Carry0->getValueType(1); Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT); } else { // We couldn't find a suitable Z. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 93ce9c22af5525..55f9737bc94dd5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -949,7 +949,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { unsigned NumOps = N->getNumOperands(); assert(NumOps <= 3 && "Too many operands"); if (NumOps == 3) - Ops[2] = N->getOperand(2); + Ops[2] = PromoteTargetBoolean(N->getOperand(2), VT); SDLoc dl(N); SDValue Res = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(VT, SVT), @@ -1867,11 +1867,6 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::FSHL: case ISD::FSHR: Res = PromoteIntOp_FunnelShift(N); break; - case ISD::SADDO_CARRY: - case ISD::SSUBO_CARRY: - case ISD::UADDO_CARRY: - case ISD::USUBO_CARRY: Res = PromoteIntOp_ADDSUBO_CARRY(N, OpNo); break; - case ISD::FRAMEADDR: case ISD::RETURNADDR: Res = PromoteIntOp_FRAMERETURNADDR(N); break; @@ -2373,19 +2368,6 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VP_ZERO_EXTEND(SDNode *N) { N->getOperand(1), N->getOperand(2)); } -SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBO_CARRY(SDNode *N, unsigned OpNo) { - assert(OpNo == 2 && "Don't know how to promote this operand!"); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDValue Carry = N->getOperand(2); - SDLoc DL(N); - - Carry = PromoteTargetBoolean(Carry, LHS.getValueType()); - - return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0); -} - SDValue DAGTypeLegalizer::PromoteIntOp_FIX(SDNode *N) { SDValue Op2 = ZExtPromotedInteger(N->getOperand(2)); return SDValue( diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 919c0d4fd2007c..0483f7c74f91a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -389,7 +389,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); - SDValue PromoteIntOp_ADDSUBO_CARRY(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_FIX(SDNode *N); SDValue PromoteIntOp_ExpOp(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index f3441c9df0f8c0..7dbf83b7adeef0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9924,7 +9924,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, assert(VTList.VTs[0].isInteger() && VTList.VTs[1].isInteger() && Ops[0].getValueType() == Ops[1].getValueType() && Ops[0].getValueType() == VTList.VTs[0] && - Ops[2].getValueType().isInteger() && + Ops[2].getValueType() == VTList.VTs[1] && "Binary operator types must match!"); break; case 
ISD::SMUL_LOHI: diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 10cb6e75ffe69e..e66fe73425e863 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1172,6 +1172,10 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::getVector(8, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); return; + case IIT_V10: + OutputTable.push_back(IITDescriptor::getVector(10, IsScalableVector)); + DecodeIITType(NextElt, Infos, Info, OutputTable); + return; case IIT_V16: OutputTable.push_back(IITDescriptor::getVector(16, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); return; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 6cab3c15124162..eb1c5f445eb8b5 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -636,6 +636,13 @@ void Instruction::copyIRFlags(const Value *V, bool IncludeWrapFlags) { } } + if (auto *TI = dyn_cast<TruncInst>(V)) { + if (isa<TruncInst>(this)) { + setHasNoSignedWrap(TI->hasNoSignedWrap()); + setHasNoUnsignedWrap(TI->hasNoUnsignedWrap()); + } + } + // Copy the exact flag. if (auto *PE = dyn_cast<PossiblyExactOperator>(V)) if (isa<PossiblyExactOperator>(this)) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index cec02e2ce6338b..d2babc748731ac 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2978,6 +2978,31 @@ bool ShuffleVectorInst::isInterleaveMask( return true; } +/// Check if the mask is a DE-interleave mask of the given factor +/// \p Factor like: +///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor> +bool ShuffleVectorInst::isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, + unsigned Factor, + unsigned &Index) { + // Check all potential start indices from 0 to (Factor - 1). + for (unsigned Idx = 0; Idx < Factor; Idx++) { + unsigned I = 0; + + // Check that elements are in ascending order by Factor. Ignore undef + // elements. + for (; I < Mask.size(); I++) + if (Mask[I] >= 0 && static_cast<unsigned>(Mask[I]) != Idx + I * Factor) + break; + + if (I == Mask.size()) { + Index = Idx; + return true; + } + } + + return false; +} + /// Try to lower a vector shuffle as a bit rotation. /// /// Look for a repeated rotation pattern in each sub group.
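With the helper now a public static member of ShuffleVectorInst (hunk above), callers outside InterleavedAccessPass can classify de-interleave shuffles directly. A minimal usage sketch against the new API; the surrounding function is hypothetical, only the isDeInterleaveMaskOfFactor call is from this patch:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instructions.h"

    // Returns which lane of a factor-4 interleaved group Mask extracts,
    // or -1 if Mask is not a factor-4 de-interleave mask.
    static int classifyFactor4Lane(llvm::ArrayRef<int> Mask) {
      unsigned Index;
      if (llvm::ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4, Index))
        return static_cast<int>(Index); // e.g. <1, 5, 9, 13> yields 1
      return -1;
    }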
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp index 2490b3012bdc2b..0bf41d7cc7c2cd 100644 --- a/llvm/lib/IR/MDBuilder.cpp +++ b/llvm/lib/IR/MDBuilder.cpp @@ -39,6 +39,16 @@ MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight, return createBranchWeights({TrueWeight, FalseWeight}); } +MDNode *MDBuilder::createLikelyBranchWeights() { + // Value chosen to match UR_NONTAKEN_WEIGHT, see BranchProbabilityInfo.cpp + return createBranchWeights((1U << 20) - 1, 1); +} + +MDNode *MDBuilder::createUnlikelyBranchWeights() { + // Value chosen to match UR_NONTAKEN_WEIGHT, see BranchProbabilityInfo.cpp + return createBranchWeights(1, (1U << 20) - 1); +} + MDNode *MDBuilder::createBranchWeights(ArrayRef Weights) { assert(Weights.size() >= 1 && "Need at least one branch weights!"); diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index b05bad7d59ec72..cefb6af12d0021 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1272,13 +1272,13 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start, MemProfFrameTable.reset(MemProfFrameHashTable::Create( /*Buckets=*/Start + FrameTableOffset, /*Payload=*/Start + FramePayloadOffset, - /*Base=*/Start, memprof::FrameLookupTrait())); + /*Base=*/Start)); if (Version >= memprof::Version2) MemProfCallStackTable.reset(MemProfCallStackHashTable::Create( /*Buckets=*/Start + CallStackTableOffset, /*Payload=*/Start + CallStackPayloadOffset, - /*Base=*/Start, memprof::CallStackLookupTrait())); + /*Base=*/Start)); #ifdef EXPENSIVE_CHECKS // Go through all the records and verify that CSId has been correctly diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 0bc5f5206dc2ff..4a6fc9d64b6900 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -451,8 +451,7 @@ static uint64_t writeMemProfRecords( llvm::MapVector &MemProfRecordData, memprof::MemProfSchema *Schema, memprof::IndexedVersion Version) { - memprof::RecordWriterTrait RecordWriter(Version); - RecordWriter.Schema = Schema; + memprof::RecordWriterTrait RecordWriter(Schema, Version); OnDiskChainedHashTableGenerator RecordTableGenerator; for (auto &[GUID, Record] : MemProfRecordData) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7947d73f9a4dd0..3d1453e3beb9a1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17927,11 +17927,11 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, } else continue; - if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode())) + if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode())) continue; // Constant ones is always righthand operand of the Add. 
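// [Editor's note] The isBuildVector* -> isConstantSplatVector* switch in this
// hunk widens the match: the isConstantSplatVector forms also recognize
// constant all-zeros/all-ones splats that are not BUILD_VECTOR nodes (notably
// SPLAT_VECTOR for scalable vectors), so tryCombineToBSL can fire on patterns
// the old BUILD_VECTOR-only checks missed.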
- if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode())) + if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode())) continue; if (Sub.getOperand(1) != Add.getOperand(0)) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3bf90778363c6c..be53be782077b7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4954,19 +4954,9 @@ defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (v8i8 V64:$opA)), - (zext (v8i8 V64:$opB))), - (AArch64vashr v8i16:$src, (i32 15))))), - (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), - (zext (extract_high_v16i8 (v16i8 V128:$opB)))), - (AArch64vashr v8i16:$src, (i32 15))))), - (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e80931a03f30b6..700242b88346cb 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3827,9 +3827,18 @@ InstructionCost AArch64TTIImpl::getShuffleCost( Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { + // Check for LD3/LD4 instructions, which are represented in llvm IR as + // deinterleaving-shuffle(load). The shuffle cost could potentially be free, + // but we model it with a cost of LT.first so that LD3/LD4 have a higher + // cost than just the load. + if (Args.size() >= 1 && isa(Args[0]) && + (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) || + ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4))) + return std::max(1, LT.first / 4); + // Check for ST3/ST4 instructions, which are represented in llvm IR as // store(interleaving-shuffle). The shuffle cost could potentially be free, - // but we model it with a cost of LT.first so that LD3/LD3 have a higher + // but we model it with a cost of LT.first so that ST3/ST4 have a higher // cost than just the store. if (CxtI && CxtI->hasOneUse() && isa(*CxtI->user_begin()) && (ShuffleVectorInst::isInterleaveMask( diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0c706d51cb665f..c4592d43f4f8a8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15996,32 +15996,35 @@ bool unsafeFPAtomicsDisabled(Function *F) { "true"; } +static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { + LLVMContext &Ctx = RMW->getContext(); + SmallVector SSNs; + Ctx.getSyncScopeNames(SSNs); + StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty() + ? 
"system" + : SSNs[RMW->getSyncScopeID()]; + + return OptimizationRemark(DEBUG_TYPE, "Passed", RMW) + << "Hardware instruction generated for atomic " + << RMW->getOperationName(RMW->getOperation()) + << " operation at memory scope " << MemScope; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::PRIVATE_ADDRESS) return AtomicExpansionKind::NotAtomic; - auto SSID = RMW->getSyncScopeID(); - - auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) { + auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { OptimizationRemarkEmitter ORE(RMW->getFunction()); - LLVMContext &Ctx = RMW->getFunction()->getContext(); - SmallVector SSNs; - Ctx.getSyncScopeNames(SSNs); - auto MemScope = SSNs[RMW->getSyncScopeID()].empty() - ? "system" - : SSNs[RMW->getSyncScopeID()]; - ORE.emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Passed", RMW) - << "Hardware instruction generated for atomic " - << RMW->getOperationName(RMW->getOperation()) - << " operation at memory scope " << MemScope - << " due to an unsafe request."; + ORE.emit([=]() { + return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request."; }); return Kind; }; + auto SSID = RMW->getSyncScopeID(); bool HasSystemScope = SSID == SyncScope::System || SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index 25e628e5cbc558..79c359a5755451 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -32,16 +32,12 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) { } bool isDynamicLDS(const GlobalVariable &GV) { - // external zero size addrspace(3) without initializer implies cuda/hip extern - // __shared__ the semantics for such a variable appears to be that all extern - // __shared__ variables alias one another. This hits different handling. + // external zero size addrspace(3) without initializer is dynlds. 
const Module *M = GV.getParent(); const DataLayout &DL = M->getDataLayout(); - if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) return false; - } - uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - return GV.hasExternalLinkage() && AllocSize == 0; + return DL.getTypeAllocSize(GV.getValueType()) == 0; } bool isLDSVariableToLower(const GlobalVariable &GV) { diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 455a90269d0258..d926ccdb59e19b 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -83,7 +83,7 @@ class RISCVAsmParser : public MCTargetAsmParser { SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); } - bool isRVE() const { return getSTI().hasFeature(RISCV::FeatureRVE); } + bool isRVE() const { return getSTI().hasFeature(RISCV::FeatureStdExtE); } RISCVTargetStreamer &getTargetStreamer() { assert(getParser().getStreamer().getTargetStreamer() && diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 6aadabdf1bc61a..998b9181efe6ac 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -64,7 +64,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() { static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureRVE); + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); if (RegNo >= 32 || (IsRVE && RegNo >= 16)) return MCDisassembler::Fail; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 5d9a58babe606c..67c9060b515772 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -40,7 +40,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits, StringRef ABIName) { auto TargetABI = getTargetABI(ABIName); bool IsRV64 = TT.isArch64Bit(); - bool IsRVE = FeatureBits[RISCV::FeatureRVE]; + bool IsRVE = FeatureBits[RISCV::FeatureStdExtE]; if (!ABIName.empty() && TargetABI == ABI_Unknown) { errs() diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 339c0397fa4518..116e5a2ab734c8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -16,6 +16,10 @@ def FeatureStdExtI : SubtargetFeature<"i", "HasStdExtI", "true", "'I' (Base Integer Instruction Set)">; +def FeatureStdExtE + : SubtargetFeature<"e", "HasStdExtE", "true", + "Implements RV{32,64}E (provides 16 rather than 32 GPRs)">; + def FeatureStdExtZic64b : SubtargetFeature<"zic64b", "HasStdExtZic64b", "true", "'Zic64b' (Cache Block Size Is 64 Bytes)">; @@ -1162,10 +1166,6 @@ def IsRV32 : Predicate<"!Subtarget->is64Bit()">, defvar RV32 = DefaultMode; def RV64 : HwMode<"+64bit", [IsRV64]>; -def FeatureRVE - : SubtargetFeature<"e", "IsRVE", "true", - "Implements RV{32,64}E (provides 16 rather than 32 GPRs)">; - def FeatureRelax : SubtargetFeature<"relax", "EnableLinkerRelax", "true", "Enable Linker relaxation.">; diff --git 
a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b0deb1d2669952..82339dd2072172 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -18951,7 +18951,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments( case CallingConv::RISCV_VectorCall: break; case CallingConv::GHC: - if (Subtarget.isRVE()) + if (Subtarget.hasStdExtE()) report_fatal_error("GHC calling convention is not supported on RVE!"); if (!Subtarget.hasStdExtFOrZfinx() || !Subtarget.hasStdExtDOrZdinx()) report_fatal_error("GHC calling convention requires the (Zfinx/F) and " @@ -19189,7 +19189,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); if (CallConv == CallingConv::GHC) { - if (Subtarget.isRVE()) + if (Subtarget.hasStdExtE()) report_fatal_error("GHC calling convention is not supported on RVE!"); ArgCCInfo.AnalyzeCallOperands(Outs, RISCV::CC_RISCV_GHC); } else diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index f9a557e02bfe1a..a4a5d9e96c271a 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -44,7 +44,7 @@ class RISCVProcessorModel f, list tunef = [], string default_march = ""> - : ProcessorModel { + : ProcessorModel { string DefaultMarch = default_march; } @@ -52,7 +52,7 @@ class RISCVTuneProcessorModel tunef = [], list f = []> - : ProcessorModel; + : ProcessorModel; def GENERIC_RV32 : RISCVProcessorModel<"generic-rv32", NoSchedModel, @@ -66,7 +66,7 @@ def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. -def : ProcessorModel<"generic", NoSchedModel, []>, GenericTuneInfo; +def GENERIC : RISCVTuneProcessorModel<"generic", NoSchedModel>, GenericTuneInfo; def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32", RocketModel, diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 367a62e830cbf5..6a48848e202201 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -65,10 +65,10 @@ RISCVRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.hasStdExtD()) return CSR_XLEN_F64_Interrupt_SaveList; if (Subtarget.hasStdExtF()) - return Subtarget.isRVE() ? CSR_XLEN_F32_Interrupt_RVE_SaveList - : CSR_XLEN_F32_Interrupt_SaveList; - return Subtarget.isRVE() ? CSR_Interrupt_RVE_SaveList - : CSR_Interrupt_SaveList; + return Subtarget.hasStdExtE() ? CSR_XLEN_F32_Interrupt_RVE_SaveList + : CSR_XLEN_F32_Interrupt_SaveList; + return Subtarget.hasStdExtE() ? CSR_Interrupt_RVE_SaveList + : CSR_Interrupt_SaveList; } bool HasVectorCSR = @@ -126,7 +126,7 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, RISCV::DUMMY_REG_PAIR_WITH_X0); // There are only 16 GPRs for RVE. 
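// [Editor's note] RV32E/RV64E provide only x0-x15, so the loop below marks
// x16-x31 (and their super-registers) reserved. The rename running through
// this patch, FeatureRVE/isRVE() to FeatureStdExtE/hasStdExtE(), is internal
// only: the user-facing feature string stays "e" (e.g. -mattr=+e), matching
// how the other standard extensions are spelled.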
- if (Subtarget.isRVE()) + if (Subtarget.hasStdExtE()) for (MCPhysReg Reg = RISCV::X16; Reg <= RISCV::X31; Reg++) markSuperRegs(Reserved, Reg); @@ -145,7 +145,7 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, RISCV::VCIX_STATE); if (MF.getFunction().getCallingConv() == CallingConv::GRAAL) { - if (Subtarget.isRVE()) + if (Subtarget.hasStdExtE()) report_fatal_error("Graal reserved registers do not exist in RVE"); markSuperRegs(Reserved, RISCV::X23); markSuperRegs(Reserved, RISCV::X27); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 40c3e5f9c6bdab..8395d4b2bf66b5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -167,10 +167,21 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) { unsigned MajorNum = getMetadataUInt(VersionMD, 0, 2); unsigned MinorNum = getMetadataUInt(VersionMD, 1); unsigned RevNum = getMetadataUInt(VersionMD, 2); - MAI.SrcLangVersion = (MajorNum * 100 + MinorNum) * 1000 + RevNum; + // Prevent Major part of OpenCL version to be 0 + MAI.SrcLangVersion = + (std::max(1U, MajorNum) * 100 + MinorNum) * 1000 + RevNum; } else { - MAI.SrcLang = SPIRV::SourceLanguage::Unknown; - MAI.SrcLangVersion = 0; + // If there is no information about OpenCL version we are forced to generate + // OpenCL 1.0 by default for the OpenCL environment to avoid puzzling + // run-times with Unknown/0.0 version output. For a reference, LLVM-SPIRV + // Translator avoids potential issues with run-times in a similar manner. + if (ST->isOpenCLEnv()) { + MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_CPP; + MAI.SrcLangVersion = 100000; + } else { + MAI.SrcLang = SPIRV::SourceLanguage::Unknown; + MAI.SrcLangVersion = 0; + } } if (auto ExtNode = M.getNamedMetadata("opencl.used.extensions")) { diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td index a1e2a92b40ac23..e6b95d32c29fad 100644 --- a/llvm/lib/Target/SystemZ/SystemZFeatures.td +++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td @@ -246,6 +246,11 @@ def FeatureInsertReferenceBitsMultiple : SystemZFeature< "Assume that the insert-reference-bits-multiple facility is installed" >; +def FeatureTestPendingExternalInterruption : SystemZFeature< + "test-pending-external-interruption", "TestPendingExternalInterruption", (all_of FeatureTestPendingExternalInterruption), + "Assume that the test-pending-external-interruption facility is installed" +>; + def Arch12NewFeatures : SystemZFeatureList<[ FeatureMiscellaneousExtensions2, FeatureGuardedStorage, @@ -253,7 +258,8 @@ def Arch12NewFeatures : SystemZFeatureList<[ FeatureMessageSecurityAssist8, FeatureVectorEnhancements1, FeatureVectorPackedDecimal, - FeatureInsertReferenceBitsMultiple + FeatureInsertReferenceBitsMultiple, + FeatureTestPendingExternalInterruption ]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 50ecd6e0744147..6ec46bddb5dc2d 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -896,6 +896,19 @@ SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering() RegSpillOffsets[Entry.Reg] = Entry.Offset; } +int SystemZXPLINKFrameLowering::getOrCreateFramePointerSaveIndex( + MachineFunction &MF) const { + SystemZMachineFunctionInfo *ZFI = MF.getInfo(); + int 
FI = ZFI->getFramePointerSaveIndex(); + if (!FI) { + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + FI = MFFrame.CreateFixedObject(8, 0, false); + MFFrame.setStackID(FI, TargetStackID::NoAlloc); + ZFI->setFramePointerSaveIndex(FI); + } + return FI; +} + // Checks if the function is a potential candidate for being a XPLeaf routine. static bool isXPLeafCandidate(const MachineFunction &MF) { const MachineFrameInfo &MFFrame = MF.getFrameInfo(); @@ -991,6 +1004,9 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( Register HighGPR = 0; int HighOffset = -1; + // Query index of the saved frame pointer. + int FPSI = MFI->getFramePointerSaveIndex(); + for (auto &CS : CSI) { Register Reg = CS.getReg(); int Offset = RegSpillOffsets[Reg]; @@ -1013,7 +1029,10 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots( // the bottom of the stack and are not truly part of the "normal" stack // frame. Mark the frame index as NoAlloc to indicate it as such. unsigned RegSize = 8; - int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset); + int FrameIdx = + (FPSI && Offset == 0) + ? FPSI + : MFFrame.CreateFixedSpillStackObject(RegSize, Offset); CS.setFrameIdx(FrameIdx); MFFrame.setStackID(FrameIdx, TargetStackID::NoAlloc); } @@ -1467,15 +1486,16 @@ void SystemZXPLINKFrameLowering::determineFrameLayout( StackSize += Regs->getCallFrameSize(); MFFrame.setStackSize(StackSize); - // We now know the stack size. Create the fixed spill stack objects for the - // register save area now. This has no impact on the stack frame layout, as - // this is already computed. However, it makes sure that all callee saved - // registers have a valid frame index assigned. - const unsigned RegSize = MF.getDataLayout().getPointerSize(); - for (auto &CS : MFFrame.getCalleeSavedInfo()) { - int Offset = RegSpillOffsets[CS.getReg()]; - if (Offset >= 0) - CS.setFrameIdx( - MFFrame.CreateFixedSpillStackObject(RegSize, Offset - StackSize)); + // We now know the stack size. Update the stack objects for the register save + // area now. This has no impact on the stack frame layout, as this is already + // computed. However, it makes sure that all callee saved registers have a + // valid offset assigned. + for (int FrameIdx = MFFrame.getObjectIndexBegin(); FrameIdx != 0; + ++FrameIdx) { + if (MFFrame.getStackID(FrameIdx) == TargetStackID::NoAlloc) { + int64_t SPOffset = MFFrame.getObjectOffset(FrameIdx); + SPOffset -= StackSize; + MFFrame.setObjectOffset(FrameIdx, SPOffset); + } } } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 03ce8882c4de5d..46131819230702 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -41,6 +41,12 @@ class SystemZFrameLowering : public TargetFrameLowering { } bool hasReservedCallFrame(const MachineFunction &MF) const override; + + // Return the offset of the backchain. + virtual unsigned getBackchainOffset(MachineFunction &MF) const = 0; + + // Get or create the frame index of where the old frame pointer is stored. + virtual int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const = 0; }; class SystemZELFFrameLowering : public SystemZFrameLowering { @@ -86,13 +92,13 @@ class SystemZELFFrameLowering : public SystemZFrameLowering { bool usePackedStack(MachineFunction &MF) const; // Return the offset of the backchain. 
- unsigned getBackchainOffset(MachineFunction &MF) const { + unsigned getBackchainOffset(MachineFunction &MF) const override { // The back chain is stored topmost with packed-stack. return usePackedStack(MF) ? SystemZMC::ELFCallFrameSize - 8 : 0; } // Get or create the frame index of where the old frame pointer is stored. - int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const; + int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const override; }; class SystemZXPLINKFrameLowering : public SystemZFrameLowering { @@ -133,6 +139,15 @@ class SystemZXPLINKFrameLowering : public SystemZFrameLowering { RegScavenger *RS) const override; void determineFrameLayout(MachineFunction &MF) const; + + // Return the offset of the backchain. + unsigned getBackchainOffset(MachineFunction &MF) const override { + // The back chain is always the first element of the frame. + return 0; + } + + // Get or create the frame index of where the old frame pointer is stored. + int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 5c2579f3bf1878..b3f93e334a9bd3 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3765,7 +3765,7 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - auto *TFL = Subtarget.getFrameLowering(); + auto *TFL = Subtarget.getFrameLowering(); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrSystem.td b/llvm/lib/Target/SystemZ/SystemZInstrSystem.td index 497844bec85d17..1f153cc92bb9c8 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrSystem.td @@ -541,6 +541,10 @@ let hasSideEffects = 1, Defs = [CC] in let hasSideEffects = 1, Defs = [CC] in def TPI : StoreInherentS<"tpi", 0xB236, null_frag, 0>; +// Test pending external interruption. +let hasSideEffects = 1, Defs = [CC], Predicates = [FeatureTestPendingExternalInterruption] in + def TPEI : UnaryRRE<"tpei", 0xB9A1, null_frag, GR64, GR64>; + // Set address limit. 
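// [Editor's note] TPEI ("tpei", opcode 0xB9A1) added above is an RRE-format
// instruction taking two 64-bit GPR operands, assembled e.g. as
// "tpei %r2, %r3"; like TPI it is modeled here as side-effecting and
// CC-defining only (null_frag), gated by the new facility feature. The
// scheduler files below merely relax the "TPI$" regex to "TPE?I$" so TPEI
// reuses TPI's WLat30/MCD scheduling class.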
let hasSideEffects = 1, Uses = [R1L] in def SAL : SideEffectInherentS<"sal", 0xB237, null_frag>; diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td index 7e6302ae656743..120d4a457ee396 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -1640,7 +1640,7 @@ def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; -def : InstRW<[WLat30, MCD], (instregex "TPI$")>; +def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td index 89edcf426bd714..acba3a1fd9919e 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -1686,7 +1686,7 @@ def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; -def : InstRW<[WLat30, MCD], (instregex "TPI$")>; +def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td index 8f6dc3befc1976..dd82b2b9b71e75 100644 --- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -1719,7 +1719,7 @@ def : InstRW<[WLat30, MCD], (instregex "(M|S|ST|T)SCH$")>; def : InstRW<[WLat30, MCD], (instregex "RCHP$")>; def : InstRW<[WLat30, MCD], (instregex "SCHM$")>; def : InstRW<[WLat30, MCD], (instregex "STC(PS|RW)$")>; -def : InstRW<[WLat30, MCD], (instregex "TPI$")>; +def : InstRW<[WLat30, MCD], (instregex "TPE?I$")>; def : InstRW<[WLat30, MCD], (instregex "SAL$")>; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 4c9e78c05dbcac..3cd1e05aa5d18c 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" @@ -1323,25 +1324,21 @@ SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { - // Always expand on Subtargets without vector instructions + // Always expand on Subtargets without vector instructions. if (!ST->hasVector()) return true; - // Always expand for operands that do not fill one vector reg - auto *Type = cast(II->getOperand(0)->getType()); - unsigned NumElts = Type->getNumElements(); - unsigned ScalarSize = Type->getScalarSizeInBits(); - unsigned MaxElts = SystemZ::VectorBits / ScalarSize; - if (NumElts < MaxElts) - return true; - - // Otherwise + // Whether or not to expand is a per-intrinsic decision. 
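// [Editor's example] Net effect of the rewritten check below, given the
// 128-bit vector registers (SystemZ::VectorBits == 128):
//   vector.reduce.add on <4 x i32>  -> not expanded (full vector, i32 < i64)
//   vector.reduce.add on <2 x i64>  -> expanded (scalar size >= 64)
//   vector.reduce.add on <2 x i32>  -> expanded (64-bit operand, not full)
//   every other reduction intrinsic -> expanded (default case)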
switch (II->getIntrinsicID()) { - // Do not expand vector.reduce.add - case Intrinsic::vector_reduce_add: - // Except for i64, since the performance benefit is dubious there - return ScalarSize >= 64; default: return true; + // Do not expand vector.reduce.add... + case Intrinsic::vector_reduce_add: + auto *VType = cast(II->getOperand(0)->getType()); + // ...unless the scalar size is i64 or larger, + // or the operand vector is not full, since the + // performance benefit is dubious in those cases. + return VType->getScalarSizeInBits() >= 64 || + VType->getPrimitiveSizeInBits() < SystemZ::VectorBits; } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index bedec0c8974a85..3a51c7c2ca854e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29830,6 +29830,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, if (VT.isVector()) { APInt APIntShiftAmt; bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); + unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { if (IsFSHR) @@ -29858,6 +29859,29 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt; uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt); + assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift"); + + if (EltSizeInBits == 8 && ShXAmt > 1 && + (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) { + // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG + // bit-select - lower using vXi16 shifts and then perform the bitmask at + // the original vector width to handle cases where we split. + MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt); + APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt); + SDValue ShX = + DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0), + DAG.getShiftAmountConstant(ShXAmt, WideVT, DL)); + SDValue ShY = + DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1), + DAG.getShiftAmountConstant(ShYAmt, WideVT, DL)); + ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX), + DAG.getConstant(MaskX, DL, VT)); + ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY), + DAG.getConstant(MaskY, DL, VT)); + return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); + } + SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0, DAG.getShiftAmountConstant(ShXAmt, VT, DL)); SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1, @@ -29874,7 +29898,6 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); unsigned ShiftOpc = IsFSHR ? 
ISD::SRL : ISD::SHL; - unsigned NumElts = VT.getVectorNumElements(); MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); diff --git a/llvm/lib/TextAPI/TextStub.cpp b/llvm/lib/TextAPI/TextStub.cpp index 0f742523f8207c..d903ba409360d6 100644 --- a/llvm/lib/TextAPI/TextStub.cpp +++ b/llvm/lib/TextAPI/TextStub.cpp @@ -276,7 +276,7 @@ namespace yaml { template <> struct MappingTraits { static void mapping(IO &IO, ExportSection &Section) { const auto *Ctx = reinterpret_cast(IO.getContext()); - assert((!Ctx || (Ctx && Ctx->FileKind != FileType::Invalid)) && + assert((!Ctx || Ctx->FileKind != FileType::Invalid) && "File type is not set in YAML context"); IO.mapRequired("archs", Section.Architectures); @@ -298,7 +298,7 @@ template <> struct MappingTraits { template <> struct MappingTraits { static void mapping(IO &IO, UndefinedSection &Section) { const auto *Ctx = reinterpret_cast(IO.getContext()); - assert((!Ctx || (Ctx && Ctx->FileKind != FileType::Invalid)) && + assert((!Ctx || Ctx->FileKind != FileType::Invalid) && "File type is not set in YAML context"); IO.mapRequired("archs", Section.Architectures); diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 5cc8258a495a6e..91d445dfc4c734 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -139,8 +139,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) { } bool CrossDSOCFI::runOnModule(Module &M) { - VeryLikelyWeights = - MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1); + VeryLikelyWeights = MDBuilder(M.getContext()).createLikelyBranchWeights(); if (M.getModuleFlag("Cross-DSO CFI") == nullptr) return false; buildCFICheck(M); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index cf13c91a867704..e7a188e9431db5 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1196,8 +1196,7 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, // function pointer to the devirtualized target. In case of a mismatch, // fall back to indirect call. if (DevirtCheckMode == WPDCheckMode::Fallback) { - MDNode *Weights = - MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1); + MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights(); // Version the indirect call site. If the called value is equal to the // given callee, 'NewInst' will be executed, otherwise the original call // site will be executed. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 9a16ca96db76b6..fc284bc61cce97 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -828,9 +828,10 @@ static Instruction *foldNoWrapAdd(BinaryOperator &Add, // More general combining of constants in the wide type. 
// (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C) + // or (zext nneg (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C) Constant *NarrowC; - if (match(Op0, - m_OneUse(m_SExt(m_NSWAddLike(m_Value(X), m_Constant(NarrowC)))))) { + if (match(Op0, m_OneUse(m_SExtLike( + m_NSWAddLike(m_Value(X), m_Constant(NarrowC)))))) { Value *WideC = Builder.CreateSExt(NarrowC, Ty); Value *NewC = Builder.CreateAdd(WideC, Op1C); Value *WideX = Builder.CreateSExt(X, Ty); @@ -844,7 +845,6 @@ static Instruction *foldNoWrapAdd(BinaryOperator &Add, Value *WideX = Builder.CreateZExt(X, Ty); return BinaryOperator::CreateAdd(WideX, NewC); } - return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index fdadf729fbd541..f0278f207549fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -8045,6 +8045,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { Constant *RHSC; if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) { switch (LHSI->getOpcode()) { + case Instruction::Select: + // fcmp eq (cond ? x : -x), 0 --> fcmp eq x, 0 + if (FCmpInst::isEquality(Pred) && match(RHSC, m_AnyZeroFP()) && + (match(LHSI, + m_Select(m_Value(), m_Value(X), m_FNeg(m_Deferred(X)))) || + match(LHSI, m_Select(m_Value(), m_FNeg(m_Value(X)), m_Deferred(X))))) + return replaceOperand(I, 0, X); + break; case Instruction::PHI: if (Instruction *NV = foldOpIntoPhi(I, cast(LHSI))) return NV; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index f4d559036b31fd..4479afbd09afde 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -67,10 +67,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final bool MinimizeSize, AAResults *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, - const DataLayout &DL, LoopInfo *LI) + BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, + ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE, - BFI, PSI, DL, LI) {} + BFI, BPI, PSI, DL, LI) {} virtual ~InstCombinerImpl() = default; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a0333b7db8f7a9..4ed4c36e21e016 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -576,12 +576,16 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { Value *Y, Value *Z) { InstCombiner::BuilderTy &Builder = IC.Builder; Value *YZ = Builder.CreateAdd(Y, Z); - auto *NewPow = Builder.CreateIntrinsic( + Instruction *NewPow = Builder.CreateIntrinsic( Intrinsic::powi, {X->getType(), YZ->getType()}, {X, YZ}, &I); - return IC.replaceInstUsesWith(I, NewPow); + + return NewPow; }; Value *X, *Y, *Z; + unsigned Opcode = I.getOpcode(); + assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) && + "Unexpected opcode"); // powi(X, Y) * X --> powi(X, Y+1) // X * powi(X, Y) --> powi(X, Y+1) @@ -589,31 +593,50 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { m_Value(X), m_Value(Y)))), m_Deferred(X)))) { Constant *One = 
ConstantInt::get(Y->getType(), 1); - if (willNotOverflowSignedAdd(Y, One, I)) - return createPowiExpr(I, *this, X, Y, One); + if (willNotOverflowSignedAdd(Y, One, I)) { + Instruction *NewPow = createPowiExpr(I, *this, X, Y, One); + return replaceInstUsesWith(I, NewPow); + } } // powi(x, y) * powi(x, z) -> powi(x, y + z) Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); - if (I.isOnlyUserOfAnyOperand() && + if (Opcode == Instruction::FMul && I.isOnlyUserOfAnyOperand() && match(Op0, m_AllowReassoc( m_Intrinsic<Intrinsic::powi>(m_Value(X), m_Value(Y)))) && match(Op1, m_AllowReassoc(m_Intrinsic<Intrinsic::powi>(m_Specific(X), m_Value(Z)))) && - Y->getType() == Z->getType()) - return createPowiExpr(I, *this, X, Y, Z); - - // powi(X, Y) / X --> powi(X, Y-1) - // This is legal when (Y - 1) can't wraparound, in which case reassoc and nnan - // are required. - // TODO: Multi-use may be also better off creating Powi(x,y-1) - if (I.hasAllowReassoc() && I.hasNoNaNs() && - match(Op0, m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>( - m_Specific(Op1), m_Value(Y))))) && - willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) { - Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType()); - return createPowiExpr(I, *this, Op1, Y, NegOne); + Y->getType() == Z->getType()) { + Instruction *NewPow = createPowiExpr(I, *this, X, Y, Z); + return replaceInstUsesWith(I, NewPow); + } + + if (Opcode == Instruction::FDiv && I.hasAllowReassoc() && I.hasNoNaNs()) { + // powi(X, Y) / X --> powi(X, Y-1) + // This is legal when (Y - 1) can't wraparound, in which case reassoc and + // nnan are required. + // TODO: Multi-use may be also better off creating Powi(x,y-1) + if (match(Op0, m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>( + m_Specific(Op1), m_Value(Y))))) && + willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) { + Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType()); + Instruction *NewPow = createPowiExpr(I, *this, Op1, Y, NegOne); + return replaceInstUsesWith(I, NewPow); + } + + // powi(X, Y) / (X * Z) --> powi(X, Y-1) / Z + // This is legal when (Y - 1) can't wraparound, in which case reassoc and + // nnan are required. + // TODO: Multi-use may be also better off creating Powi(x,y-1) + if (match(Op0, m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>( + m_Value(X), m_Value(Y))))) && + match(Op1, m_AllowReassoc(m_c_FMul(m_Specific(X), m_Value(Z)))) && + willNotOverflowSignedSub(Y, ConstantInt::get(Y->getType(), 1), I)) { + Constant *NegOne = ConstantInt::getAllOnesValue(Y->getType()); + auto *NewPow = createPowiExpr(I, *this, X, Y, NegOne); + return BinaryOperator::CreateFDivFMF(NewPow, Z, &I); + } } return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index ee9b1afbb86ac0..3ac5c2559ddf94 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1347,9 +1347,13 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) { SI->swapProfMetadata(); break; } - case Instruction::Br: - cast<BranchInst>(U)->swapSuccessors(); // swaps prof metadata too + case Instruction::Br: { + BranchInst *BI = cast<BranchInst>(U); + BI->swapSuccessors(); // swaps prof metadata too + if (BPI) + BPI->swapSuccEdgesProbabilities(BI->getParent()); break; + } case Instruction::Xor: replaceInstUsesWith(cast<Instruction>(*U), I); // Add to worklist for DCE.
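The BPI bookkeeping added in this hunk, and repeated at each swapSuccessors() site in the visitBranchInst hunks that follow, keeps an optionally-present BranchProbabilityInfo consistent with the prof metadata whenever a conditional branch is inverted. A condensed sketch of the pattern, with a hypothetical helper name and under the assumption that BPI may be null (InstCombine only consumes a cached analysis and updates it opportunistically):

    #include "llvm/Analysis/BranchProbabilityInfo.h"
    #include "llvm/IR/Instructions.h"

    // Retarget a conditional branch onto an already-inverted condition while
    // keeping branch_weights metadata and any live BPI in sync.
    static void invertBranch(llvm::BranchInst &BI, llvm::Value *NotCond,
                             llvm::BranchProbabilityInfo *BPI) {
      BI.swapSuccessors(); // swaps !prof branch weights too
      if (BPI)
        BPI->swapSuccEdgesProbabilities(BI.getParent());
      BI.setCondition(NotCond);
    }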
@@ -3525,6 +3529,8 @@ Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) { if (match(Cond, m_Not(m_Value(X))) && !isa(X)) { // Swap Destinations and condition... BI.swapSuccessors(); + if (BPI) + BPI->swapSuccEdgesProbabilities(BI.getParent()); return replaceOperand(BI, 0, X); } @@ -3538,6 +3544,8 @@ Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) { Value *NotX = Builder.CreateNot(X, "not." + X->getName()); Value *Or = Builder.CreateLogicalOr(NotX, Y); BI.swapSuccessors(); + if (BPI) + BPI->swapSuccEdgesProbabilities(BI.getParent()); return replaceOperand(BI, 0, Or); } @@ -3554,6 +3562,8 @@ Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) { auto *Cmp = cast(Cond); Cmp->setPredicate(CmpInst::getInversePredicate(Pred)); BI.swapSuccessors(); + if (BPI) + BPI->swapSuccEdgesProbabilities(BI.getParent()); Worklist.push(Cmp); return &BI; } @@ -5248,7 +5258,8 @@ static bool combineInstructionsOverFunction( Function &F, InstructionWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, LoopInfo *LI, const InstCombineOptions &Opts) { + BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI, LoopInfo *LI, + const InstCombineOptions &Opts) { auto &DL = F.getParent()->getDataLayout(); /// Builder - This is an IRBuilder that automatically inserts new @@ -5286,7 +5297,7 @@ static bool combineInstructionsOverFunction( << F.getName() << "\n"); InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT, - ORE, BFI, PSI, DL, LI); + ORE, BFI, BPI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; bool MadeChangeInThisIteration = IC.prepareWorklist(F, RPOT); MadeChangeInThisIteration |= IC.run(); @@ -5347,9 +5358,10 @@ PreservedAnalyses InstCombinePass::run(Function &F, MAMProxy.getCachedResult(*F.getParent()); auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult(F) : nullptr; + auto *BPI = AM.getCachedResult(F); if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, - BFI, PSI, LI, Options)) + BFI, BPI, PSI, LI, Options)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -5396,9 +5408,14 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { (PSI && PSI->hasProfileSummary()) ? 
@@ -5396,9 +5408,14 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
       (PSI && PSI->hasProfileSummary())
           ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
           : nullptr;
+  BranchProbabilityInfo *BPI = nullptr;
+  if (auto *WrapperPass =
+          getAnalysisIfAvailable<BranchProbabilityInfoWrapperPass>())
+    BPI = &WrapperPass->getBPI();
   return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
-                                         BFI, PSI, LI, InstCombineOptions());
+                                         BFI, BPI, PSI, LI,
+                                         InstCombineOptions());
 }
 
 char InstructionCombiningPass::ID = 0;
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 26fedbfd65dd41..9cc978dc6c16ef 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1817,7 +1817,7 @@ Instruction *AddressSanitizer::genAMDGPUReportBlock(IRBuilder<> &IRB,
 
   auto *Trm =
       SplitBlockAndInsertIfThen(ReportCond, &*IRB.GetInsertPoint(), false,
-                                MDBuilder(*C).createBranchWeights(1, 100000));
+                                MDBuilder(*C).createUnlikelyBranchWeights());
   Trm->getParent()->setName("asan.report");
 
   if (Recover)
@@ -1894,7 +1894,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
   // We use branch weights for the slow path check, to indicate that the slow
   // path is rarely taken. This seems to be the case for SPEC benchmarks.
   Instruction *CheckTerm = SplitBlockAndInsertIfThen(
-      Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+      Cmp, InsertBefore, false, MDBuilder(*C).createUnlikelyBranchWeights());
   assert(cast<BranchInst>(CheckTerm)->isUnconditional());
   BasicBlock *NextBB = CheckTerm->getSuccessor(0);
   IRB.SetInsertPoint(CheckTerm);
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 9a2ec7618c1a49..cdd891fc89483d 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1229,8 +1229,8 @@ bool DataFlowSanitizer::initializeModule(Module &M) {
       FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
                         /*isVarArg=*/false);
 
-  ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
-  OriginStoreWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+  ColdCallWeights = MDBuilder(*Ctx).createUnlikelyBranchWeights();
+  OriginStoreWeights = MDBuilder(*Ctx).createUnlikelyBranchWeights();
   return true;
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 3890aa8ca6ee60..a35f24447cc39b 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -293,8 +293,8 @@ class HWAddressSanitizer {
       : M(M), SSI(SSI) {
     this->Recover = optOr(ClRecover, Recover);
     this->CompileKernel = optOr(ClEnableKhwasan, CompileKernel);
-    this->Rng =
-        ClRandomSkipRate.getNumOccurrences() ? M.createRNG("hwasan") : nullptr;
+    this->Rng = ClRandomSkipRate.getNumOccurrences() ?
M.createRNG(DEBUG_TYPE) + : nullptr; initializeModule(); } @@ -911,7 +911,7 @@ HWAddressSanitizer::insertShadowTagCheck(Value *Ptr, Instruction *InsertBefore, R.TagMismatchTerm = SplitBlockAndInsertIfThen( TagMismatch, InsertBefore, false, - MDBuilder(*C).createBranchWeights(1, 100000), &DTU, LI); + MDBuilder(*C).createUnlikelyBranchWeights(), &DTU, LI); return R; } @@ -952,7 +952,7 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, IRB.CreateICmpUGT(TCI.MemTag, ConstantInt::get(Int8Ty, 15)); Instruction *CheckFailTerm = SplitBlockAndInsertIfThen( OutOfShortGranuleTagRange, TCI.TagMismatchTerm, !Recover, - MDBuilder(*C).createBranchWeights(1, 100000), &DTU, LI); + MDBuilder(*C).createUnlikelyBranchWeights(), &DTU, LI); IRB.SetInsertPoint(TCI.TagMismatchTerm); Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(TCI.PtrLong, 15), Int8Ty); @@ -960,7 +960,7 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1)); Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, TCI.MemTag); SplitBlockAndInsertIfThen(PtrLowBitsOOB, TCI.TagMismatchTerm, false, - MDBuilder(*C).createBranchWeights(1, 100000), &DTU, + MDBuilder(*C).createUnlikelyBranchWeights(), &DTU, LI, CheckFailTerm->getParent()); IRB.SetInsertPoint(TCI.TagMismatchTerm); @@ -969,7 +969,7 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr); Value *InlineTagMismatch = IRB.CreateICmpNE(TCI.PtrTag, InlineTag); SplitBlockAndInsertIfThen(InlineTagMismatch, TCI.TagMismatchTerm, false, - MDBuilder(*C).createBranchWeights(1, 100000), &DTU, + MDBuilder(*C).createUnlikelyBranchWeights(), &DTU, LI, CheckFailTerm->getParent()); IRB.SetInsertPoint(CheckFailTerm); diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index b22e7f7fc0bee0..28dc1c02b661ac 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -71,8 +71,7 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { "compatible with -fsanitize=kcfi on this target")); IntegerType *Int32Ty = Type::getInt32Ty(Ctx); - MDNode *VeryUnlikelyWeights = - MDBuilder(Ctx).createBranchWeights(1, (1U << 20) - 1); + MDNode *VeryUnlikelyWeights = MDBuilder(Ctx).createUnlikelyBranchWeights(); Triple T(M.getTargetTriple()); for (CallInst *CI : KCFICalls) { diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index ee3531bbd68df3..1d7d0356ea6956 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1040,8 +1040,8 @@ void MemorySanitizer::initializeModule(Module &M) { OriginTy = IRB.getInt32Ty(); PtrTy = IRB.getPtrTy(); - ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000); - OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); + ColdCallWeights = MDBuilder(*C).createUnlikelyBranchWeights(); + OriginStoreWeights = MDBuilder(*C).createUnlikelyBranchWeights(); if (!CompileKernel) { if (TrackOrigins) diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 17c1c442384220..56d4907ae47a98 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -25,6 +25,7 @@ 
#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" @@ -88,15 +89,14 @@ static cl::opt ClCoverageLevel( "sanitizer-coverage-level", cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, " "3: all blocks and critical edges"), - cl::Hidden, cl::init(0)); + cl::Hidden); static cl::opt ClTracePC("sanitizer-coverage-trace-pc", - cl::desc("Experimental pc tracing"), cl::Hidden, - cl::init(false)); + cl::desc("Experimental pc tracing"), cl::Hidden); static cl::opt ClTracePCGuard("sanitizer-coverage-trace-pc-guard", cl::desc("pc tracing with a guard"), - cl::Hidden, cl::init(false)); + cl::Hidden); // If true, we create a global variable that contains PCs of all instrumented // BBs, put this global into a named section, and pass this section's bounds @@ -106,38 +106,38 @@ static cl::opt ClTracePCGuard("sanitizer-coverage-trace-pc-guard", // inline-bool-flag. static cl::opt ClCreatePCTable("sanitizer-coverage-pc-table", cl::desc("create a static PC table"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters", cl::desc("increments 8-bit counter for every edge"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag", - cl::desc("sets a boolean flag for every edge"), cl::Hidden, - cl::init(false)); + cl::desc("sets a boolean flag for every edge"), + cl::Hidden); static cl::opt ClCMPTracing("sanitizer-coverage-trace-compares", cl::desc("Tracing of CMP and similar instructions"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClDIVTracing("sanitizer-coverage-trace-divs", cl::desc("Tracing of DIV instructions"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClLoadTracing("sanitizer-coverage-trace-loads", cl::desc("Tracing of load instructions"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClStoreTracing("sanitizer-coverage-trace-stores", cl::desc("Tracing of store instructions"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClGEPTracing("sanitizer-coverage-trace-geps", cl::desc("Tracing of GEP instructions"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClPruneBlocks("sanitizer-coverage-prune-blocks", @@ -146,12 +146,11 @@ static cl::opt static cl::opt ClStackDepth("sanitizer-coverage-stack-depth", cl::desc("max stack depth tracing"), - cl::Hidden, cl::init(false)); + cl::Hidden); static cl::opt ClCollectCF("sanitizer-coverage-control-flow", - cl::desc("collect control flow for each function"), cl::Hidden, - cl::init(false)); + cl::desc("collect control flow for each function"), cl::Hidden); namespace { @@ -203,25 +202,25 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { return Options; } -using DomTreeCallback = function_ref; -using PostDomTreeCallback = - function_ref; - class ModuleSanitizerCoverage { public: - ModuleSanitizerCoverage( - const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(), - const SpecialCaseList *Allowlist = nullptr, - const SpecialCaseList *Blocklist = nullptr) - : Options(OverrideFromCL(Options)), Allowlist(Allowlist), - Blocklist(Blocklist) {} - bool instrumentModule(Module &M, DomTreeCallback DTCallback, - PostDomTreeCallback PDTCallback); + using DomTreeCallback = function_ref; + using PostDomTreeCallback = + function_ref; + + 
@@ -203,25 +202,25 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
   return Options;
 }
 
-using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>;
-using PostDomTreeCallback =
-    function_ref<const PostDominatorTree *(Function &F)>;
-
 class ModuleSanitizerCoverage {
 public:
-  ModuleSanitizerCoverage(
-      const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
-      const SpecialCaseList *Allowlist = nullptr,
-      const SpecialCaseList *Blocklist = nullptr)
-      : Options(OverrideFromCL(Options)), Allowlist(Allowlist),
-        Blocklist(Blocklist) {}
-  bool instrumentModule(Module &M, DomTreeCallback DTCallback,
-                        PostDomTreeCallback PDTCallback);
+  using DomTreeCallback = function_ref<const DominatorTree &(Function &F)>;
+  using PostDomTreeCallback =
+      function_ref<const PostDominatorTree &(Function &F)>;
+
+  ModuleSanitizerCoverage(Module &M, DomTreeCallback DTCallback,
+                          PostDomTreeCallback PDTCallback,
+                          const SanitizerCoverageOptions &Options,
+                          const SpecialCaseList *Allowlist,
+                          const SpecialCaseList *Blocklist)
+      : M(M), DTCallback(DTCallback), PDTCallback(PDTCallback),
+        Options(Options), Allowlist(Allowlist), Blocklist(Blocklist) {}
+
+  bool instrumentModule();
 
 private:
   void createFunctionControlFlow(Function &F);
-  void instrumentFunction(Function &F, DomTreeCallback DTCallback,
-                          PostDomTreeCallback PDTCallback);
+  void instrumentFunction(Function &F);
   void InjectCoverageForIndirectCalls(Function &F,
                                       ArrayRef<Instruction *> IndirCalls);
   void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
@@ -251,6 +250,11 @@ class ModuleSanitizerCoverage {
   std::string getSectionName(const std::string &Section) const;
   std::string getSectionStart(const std::string &Section) const;
   std::string getSectionEnd(const std::string &Section) const;
+
+  Module &M;
+  DomTreeCallback DTCallback;
+  PostDomTreeCallback PDTCallback;
+
   FunctionCallee SanCovTracePCIndir;
   FunctionCallee SanCovTracePC, SanCovTracePCGuard;
   std::array<FunctionCallee, 4> SanCovTraceCmpFunction;
@@ -285,16 +289,17 @@ class ModuleSanitizerCoverage {
 
 PreservedAnalyses SanitizerCoveragePass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
-  ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
-                                       Blocklist.get());
   auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  auto DTCallback = [&FAM](Function &F) -> const DominatorTree * {
-    return &FAM.getResult<DominatorTreeAnalysis>(F);
+  auto DTCallback = [&FAM](Function &F) -> const DominatorTree & {
+    return FAM.getResult<DominatorTreeAnalysis>(F);
   };
-  auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * {
-    return &FAM.getResult<PostDominatorTreeAnalysis>(F);
+  auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree & {
+    return FAM.getResult<PostDominatorTreeAnalysis>(F);
   };
-  if (!ModuleSancov.instrumentModule(M, DTCallback, PDTCallback))
+  ModuleSanitizerCoverage ModuleSancov(M, DTCallback, PDTCallback,
+                                       OverrideFromCL(Options), Allowlist.get(),
+                                       Blocklist.get());
+  if (!ModuleSancov.instrumentModule())
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA = PreservedAnalyses::none();
@@ -365,8 +370,7 @@ Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
   return CtorFunc;
 }
 
-bool ModuleSanitizerCoverage::instrumentModule(
-    Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+bool ModuleSanitizerCoverage::instrumentModule() {
   if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
     return false;
   if (Allowlist &&
@@ -479,7 +483,7 @@ bool ModuleSanitizerCoverage::instrumentModule(
     M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, PtrTy);
 
   for (auto &F : M)
-    instrumentFunction(F, DTCallback, PDTCallback);
+    instrumentFunction(F);
 
   Function *Ctor = nullptr;
 
@@ -518,29 +522,29 @@ bool ModuleSanitizerCoverage::instrumentModule(
 }
 
 // True if block has successors and it dominates all of them.
-static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
+static bool isFullDominator(const BasicBlock *BB, const DominatorTree &DT) {
   if (succ_empty(BB))
     return false;
 
   return llvm::all_of(successors(BB), [&](const BasicBlock *SUCC) {
-    return DT->dominates(BB, SUCC);
+    return DT.dominates(BB, SUCC);
   });
 }
 
 // True if block has predecessors and it postdominates all of them.
static bool isFullPostDominator(const BasicBlock *BB, - const PostDominatorTree *PDT) { + const PostDominatorTree &PDT) { if (pred_empty(BB)) return false; return llvm::all_of(predecessors(BB), [&](const BasicBlock *PRED) { - return PDT->dominates(BB, PRED); + return PDT.dominates(BB, PRED); }); } static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB, - const DominatorTree *DT, - const PostDominatorTree *PDT, + const DominatorTree &DT, + const PostDominatorTree &PDT, const SanitizerCoverageOptions &Options) { // Don't insert coverage for blocks containing nothing but unreachable: we // will never call __sanitizer_cov() for them, so counting them in @@ -568,17 +572,16 @@ static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB, && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor()); } - // Returns true iff From->To is a backedge. // A twist here is that we treat From->To as a backedge if // * To dominates From or // * To->UniqueSuccessor dominates From static bool IsBackEdge(BasicBlock *From, BasicBlock *To, - const DominatorTree *DT) { - if (DT->dominates(To, From)) + const DominatorTree &DT) { + if (DT.dominates(To, From)) return true; if (auto Next = To->getUniqueSuccessor()) - if (DT->dominates(Next, From)) + if (DT.dominates(Next, From)) return true; return false; } @@ -588,7 +591,7 @@ static bool IsBackEdge(BasicBlock *From, BasicBlock *To, // // Note that Cmp pruning is controlled by the same flag as the // BB pruning. -static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT, +static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree &DT, const SanitizerCoverageOptions &Options) { if (!Options.NoPrune) if (CMP->hasOneUse()) @@ -599,8 +602,7 @@ static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT, return true; } -void ModuleSanitizerCoverage::instrumentFunction( - Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { +void ModuleSanitizerCoverage::instrumentFunction(Function &F) { if (F.empty()) return; if (F.getName().contains(".module_ctor")) @@ -629,8 +631,10 @@ void ModuleSanitizerCoverage::instrumentFunction( return; if (F.hasFnAttribute(Attribute::NoSanitizeCoverage)) return; - if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) - SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests()); + if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) { + SplitAllCriticalEdges( + F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests()); + } SmallVector IndirCalls; SmallVector BlocksToInstrument; SmallVector CmpTraceTargets; @@ -640,8 +644,8 @@ void ModuleSanitizerCoverage::instrumentFunction( SmallVector Loads; SmallVector Stores; - const DominatorTree *DT = DTCallback(F); - const PostDominatorTree *PDT = PDTCallback(F); + const DominatorTree &DT = DTCallback(F); + const PostDominatorTree &PDT = PDTCallback(F); bool IsLeafFunc = true; for (auto &BB : F) { @@ -979,8 +983,9 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, FunctionBoolArray->getValueType(), FunctionBoolArray, {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)}); auto Load = IRB.CreateLoad(Int1Ty, FlagPtr); - auto ThenTerm = - SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false); + auto ThenTerm = SplitBlockAndInsertIfThen( + IRB.CreateIsNull(Load), &*IP, false, + MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); IRBuilder<> ThenIRB(ThenTerm); auto Store = 
ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr); Load->setNoSanitizeMetadata(); @@ -997,7 +1002,9 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack); auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack); - auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false); + auto ThenTerm = SplitBlockAndInsertIfThen( + IsStackLower, &*IP, false, + MDBuilder(IRB.getContext()).createUnlikelyBranchWeights()); IRBuilder<> ThenIRB(ThenTerm); auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack); LowestStack->setNoSanitizeMetadata(); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9db9acdcc6f9e6..33c4decd58a6c2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9324,52 +9324,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } -/// Creates either vp_store or vp_scatter intrinsics calls to represent -/// predicated store/scatter. -static Instruction * -lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, - Value *StoredVal, bool IsScatter, Value *Mask, - Value *EVL, const Align &Alignment) { - CallInst *Call; - if (IsScatter) { - Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), - Intrinsic::vp_scatter, - {StoredVal, Addr, Mask, EVL}); - } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( - Instruction::Store, Type::getVoidTy(EVL->getContext()), - {StoredVal, Addr})); - } - Call->addParamAttr( - 1, Attribute::getWithAlignment(Call->getContext(), Alignment)); - return Call; -} - -/// Creates either vp_load or vp_gather intrinsics calls to represent -/// predicated load/gather. -static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, - VectorType *DataTy, - Value *Addr, bool IsGather, - Value *Mask, Value *EVL, - const Align &Alignment) { - CallInst *Call; - if (IsGather) { - Call = - Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, - nullptr, "wide.masked.gather"); - } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - Call = cast(VBuilder.createVectorInstruction( - Instruction::Load, DataTy, Addr, "vp.op.load")); - } - Call->addParamAttr( - 0, Attribute::getWithAlignment(Call->getContext(), Alignment)); - return Call; -} - void VPWidenLoadRecipe::execute(VPTransformState &State) { auto *LI = cast(&Ingredient); @@ -9391,48 +9345,62 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { Mask = Builder.CreateVectorReverse(Mask, "reverse"); } - // TODO: split this into several classes for better design. - if (State.EVL) { - assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " - "explicit vector length."); - assert(cast(State.EVL)->getOpcode() == - VPInstruction::ExplicitVectorLength && - "EVL must be VPInstruction::ExplicitVectorLength."); - Value *EVL = State.get(State.EVL, VPIteration(0, 0)); - // If EVL is not nullptr, then EVL must be a valid value set during plan - // creation, possibly default value = whole vector register length. 
EVL
-    // is created only if TTI prefers predicated vectorization, thus if EVL
-    // is not nullptr it also implies preference for predicated
-    // vectorization.
-    // FIXME: Support reverse loading after vp_reverse is added.
-    NewLI = lowerLoadUsingVectorIntrinsics(
-        Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
-        CreateGather, Mask, EVL, Alignment);
-  } else if (CreateGather) {
-    Value *VectorGep = State.get(getAddr(), Part);
-    NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, Mask,
-                                       nullptr, "wide.masked.gather");
-    State.addMetadata(NewLI, LI);
+    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateGather);
+    if (CreateGather) {
+      NewLI = Builder.CreateMaskedGather(DataTy, Addr, Alignment, Mask, nullptr,
+                                         "wide.masked.gather");
+    } else if (Mask) {
+      NewLI = Builder.CreateMaskedLoad(DataTy, Addr, Alignment, Mask,
+                                       PoisonValue::get(DataTy),
+                                       "wide.masked.load");
     } else {
-      auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
-      if (Mask)
-        NewLI = Builder.CreateMaskedLoad(DataTy, VecPtr, Alignment, Mask,
-                                         PoisonValue::get(DataTy),
-                                         "wide.masked.load");
-      else
-        NewLI =
-            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
-      // Add metadata to the load, but setVectorValue to the reverse shuffle.
-      State.addMetadata(NewLI, LI);
-      if (Reverse)
-        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
+      NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
     }
-
+    // Add metadata to the load, but setVectorValue to the reverse shuffle.
+    State.addMetadata(NewLI, LI);
+    if (Reverse)
+      NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
     State.set(this, NewLI, Part);
   }
 }
 
+void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse loading after vp_reverse is added.
+  assert(!isReverse() && "Reverse loads are not implemented yet.");
+
+  auto *LI = cast<LoadInst>(&Ingredient);
+
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  bool CreateGather = !isConsecutive();
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+  CallInst *NewLI;
+  Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+  Value *Addr = State.get(getAddr(), 0, !CreateGather);
+  Value *Mask = getMask()
+                    ? State.get(getMask(), 0)
+                    : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  if (CreateGather) {
+    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
+                                    {Addr, Mask, EVL}, nullptr,
+                                    "wide.masked.gather");
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Load, DataTy, Addr, "vp.op.load"));
+  }
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  State.addMetadata(NewLI, LI);
+  State.set(this, NewLI, 0);
+}
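A scalar model of the vp.load/vp.gather semantics this recipe emits may help: lanes at or beyond EVL are inactive regardless of the mask, which is why the all-true splat above is a safe default when the recipe carries no mask. This is an illustrative sketch only; IR poison is modeled as zero here.

#include <cstddef>
#include <vector>

// Hypothetical scalar reference for a vp.load of VF lanes.
static std::vector<int> vpLoadModel(const int *Addr,
                                    const std::vector<bool> &Mask,
                                    size_t EVL, size_t VF) {
  std::vector<int> Out(VF, 0); // inactive lanes are poison in IR
  for (size_t Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane]) // EVL bounds activity, then the mask applies
      Out[Lane] = Addr[Lane];
  return Out;
}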
 
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
 
@@ -9456,45 +9424,62 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
     Value *StoredVal = State.get(StoredVPValue, Part);
     if (isReverse()) {
-      assert(!State.EVL && "reversing not yet implemented with EVL");
       // If we store to reverse consecutive memory locations, then we need
       // to reverse the order of elements in the stored value.
       StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
       // We don't want to update the value in the map as it might be used in
       // another expression. So don't call resetVectorValue(StoredVal).
     }
-    // TODO: split this into several classes for better design.
-    if (State.EVL) {
-      assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
-                              "explicit vector length.");
-      assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
-                 VPInstruction::ExplicitVectorLength &&
-             "EVL must be VPInstruction::ExplicitVectorLength.");
-      Value *EVL = State.get(State.EVL, VPIteration(0, 0));
-      // If EVL is not nullptr, then EVL must be a valid value set during plan
-      // creation, possibly default value = whole vector register length. EVL
-      // is created only if TTI prefers predicated vectorization, thus if EVL
-      // is not nullptr it also implies preference for predicated
-      // vectorization.
-      // FIXME: Support reverse store after vp_reverse is added.
-      NewSI = lowerStoreUsingVectorIntrinsics(
-          Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
-          CreateScatter, Mask, EVL, Alignment);
-    } else if (CreateScatter) {
-      Value *VectorGep = State.get(getAddr(), Part);
-      NewSI =
-          Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, Mask);
-    } else {
-      auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
-      if (Mask)
-        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
-      else
-        NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
-    }
+    Value *Addr = State.get(getAddr(), Part, /*IsScalar*/ !CreateScatter);
+    if (CreateScatter)
+      NewSI = Builder.CreateMaskedScatter(StoredVal, Addr, Alignment, Mask);
+    else if (Mask)
+      NewSI = Builder.CreateMaskedStore(StoredVal, Addr, Alignment, Mask);
+    else
+      NewSI = Builder.CreateAlignedStore(StoredVal, Addr, Alignment);
     State.addMetadata(NewSI, SI);
   }
 }
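The store side has a stronger obligation than the load side: lanes that are masked off or at/beyond EVL must not touch memory at all, not merely produce undefined values. A matching scalar sketch (illustrative only):

#include <cstddef>
#include <vector>

// Hypothetical scalar reference for a vp.store of VF lanes.
static void vpStoreModel(int *Addr, const std::vector<int> &Val,
                         const std::vector<bool> &Mask, size_t EVL, size_t VF) {
  for (size_t Lane = 0; Lane < VF; ++Lane)
    if (Lane < EVL && Mask[Lane]) // inactive lanes perform no store at all
      Addr[Lane] = Val[Lane];
}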
+void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+                          "explicit vector length.");
+  // FIXME: Support reverse stores after vp_reverse is added.
+  assert(!isReverse() && "Reverse stores are not implemented yet.");
+
+  auto *SI = cast<StoreInst>(&Ingredient);
+
+  VPValue *StoredValue = getStoredValue();
+  bool CreateScatter = !isConsecutive();
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+
+  CallInst *NewSI = nullptr;
+  Value *StoredVal = State.get(StoredValue, 0);
+  Value *EVL = State.get(getEVL(), VPIteration(0, 0));
+  Value *Mask = getMask()
+                    ? State.get(getMask(), 0)
+                    : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  Value *Addr = State.get(getAddr(), 0, !CreateScatter);
+  if (CreateScatter) {
+    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                    Intrinsic::vp_scatter,
+                                    {StoredVal, Addr, Mask, EVL});
+  } else {
+    VectorBuilder VBuilder(Builder);
+    VBuilder.setEVL(EVL).setMask(Mask);
+    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+        Instruction::Store, Type::getVoidTy(EVL->getContext()),
+        {StoredVal, Addr}));
+  }
+  NewSI->addParamAttr(
+      1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
+  State.addMetadata(NewSI, SI);
+}
+
 // Determine how to lower the scalar epilogue, which depends on 1) optimising
 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
 // predication, and 4) a TTI hook that analyses whether the loop is suitable
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 25cb6bb599831a..1b56bb7b600c26 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13139,9 +13139,30 @@ Value *BoUpSLP::vectorizeTree(
         assert(Vec->getType()->isIntOrIntVectorTy() &&
                PrevVec->getType()->isIntOrIntVectorTy() &&
                "Expected integer vector types only.");
-        assert(MinBWs.contains(TE->UserTreeIndices.front().UserTE) &&
-               "Expected user in MinBWs.");
-        bool IsSigned = MinBWs.lookup(TE->UserTreeIndices.front().UserTE).second;
+        std::optional<std::pair<unsigned, bool>> Res;
+        if (const TreeEntry *BaseTE = getTreeEntry(TE->Scalars.front())) {
+          SmallVector<const TreeEntry *> BaseTEs;
+          if (BaseTE->isSame(TE->Scalars))
+            BaseTEs.push_back(BaseTE);
+          auto It = MultiNodeScalars.find(TE->Scalars.front());
+          if (It != MultiNodeScalars.end()) {
+            for (const TreeEntry *MNTE : It->getSecond())
+              if (MNTE->isSame(TE->Scalars))
+                BaseTEs.push_back(MNTE);
+          }
+          const auto *BaseIt = find_if(BaseTEs, [&](const TreeEntry *BaseTE) {
+            return MinBWs.contains(BaseTE);
+          });
+          if (BaseIt != BaseTEs.end())
+            Res = MinBWs.lookup(*BaseIt);
+        }
+        if (!Res) {
+          assert(MinBWs.contains(TE->UserTreeIndices.front().UserTE) &&
+                 "Expected user in MinBWs.");
+          Res = MinBWs.lookup(TE->UserTreeIndices.front().UserTE);
+        }
+        assert(Res && "Expected user node or perfect diamond match in MinBWs.");
+        bool IsSigned = Res->second;
         Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), IsSigned);
       }
       PrevVec->replaceAllUsesWith(Vec);
@@ -14447,7 +14468,8 @@ bool BoUpSLP::collectValuesToDemote(
     ++BitWidth1;
   if (auto *I = dyn_cast<Instruction>(V)) {
     APInt Mask = DB->getDemandedBits(I);
-    unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+    unsigned BitWidth2 =
+        std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
     while (!IsSigned && BitWidth2 < OrigBitWidth) {
       APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
       if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 334b10e2e5d097..c74329a0bcc4ac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,15 +242,6 @@ struct VPTransformState {
   ElementCount VF;
   unsigned UF;
 
-  /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
-  /// value set during plan transformation, possibly a default value = whole
-  /// vector register length. EVL is created only if TTI prefers predicated
-  /// vectorization, thus if EVL is not nullptr it also implies preference for
-  /// predicated vectorization.
- /// TODO: this is a temporarily solution, the EVL must be explicitly used by - /// the recipes and must be removed here. - VPValue *EVL = nullptr; - /// Hold the indices to generate specific scalar instructions. Null indicates /// that all instances are to be generated, using either scalar or vector /// instructions. @@ -875,7 +866,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { return true; case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPBranchOnMaskSC: + case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: + case VPRecipeBase::VPWidenStoreEVLSC: case VPRecipeBase::VPWidenStoreSC: // TODO: Widened stores don't define a value, but widened loads do. Split // the recipes to be able to make widened loads VPSingleDefRecipes. @@ -2318,11 +2311,15 @@ class VPWidenMemoryRecipe : public VPRecipeBase { } public: - VPWidenMemoryRecipe *clone() override = 0; + VPWidenMemoryRecipe *clone() override { + llvm_unreachable("cloning not supported"); + } static inline bool classof(const VPRecipeBase *R) { - return R->getVPDefID() == VPDef::VPWidenLoadSC || - R->getVPDefID() == VPDef::VPWidenStoreSC; + return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || + R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || + R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; } static inline bool classof(const VPUser *U) { @@ -2390,13 +2387,48 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { bool onlyFirstLaneUsed(const VPValue *Op) const override { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - // Widened, consecutive loads operations only demand the first lane of // their address. return Op == getAddr() && isConsecutive(); } }; +/// A recipe for widening load operations with vector-predication intrinsics, +/// using the address to load from, the explicit vector length and an optional +/// mask. +struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { + VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask) + : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(), + {L->getAddr(), EVL}, L->isConsecutive(), false, + L->getDebugLoc()), + VPValue(this, &getIngredient()) { + setMask(Mask); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenLoadEVLSC) + + /// Return the EVL operand. + VPValue *getEVL() const { return getOperand(1); } + + /// Generate the wide load or gather. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + // Widened loads only demand the first lane of EVL and consecutive loads + // only demand the first lane of their address. + return Op == getEVL() || (Op == getAddr() && isConsecutive()); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask. 
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { @@ -2436,6 +2468,50 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { return Op == getAddr() && isConsecutive() && Op != getStoredValue(); } }; + +/// A recipe for widening store operations with vector-predication intrinsics, +/// using the value to store, the address to store to, the explicit vector +/// length and an optional mask. +struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe { + VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask) + : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(), + {S->getAddr(), S->getStoredValue(), EVL}, + S->isConsecutive(), false, S->getDebugLoc()) { + setMask(Mask); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC) + + /// Return the address accessed by this recipe. + VPValue *getStoredValue() const { return getOperand(1); } + + /// Return the EVL operand. + VPValue *getEVL() const { return getOperand(2); } + + /// Generate the wide store or scatter. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + if (Op == getEVL()) { + assert(getStoredValue() != Op && "unexpected store of EVL"); + return true; + } + // Widened, consecutive memory operations only demand the first lane of + // their address, unless the same operand is also stored. That latter can + // happen with opaque pointers. + return Op == getAddr() && isConsecutive() && Op != getStoredValue(); + } +}; + /// Recipe to expand a SCEV expression. 
class VPExpandSCEVRecipe : public VPSingleDefRecipe { const SCEV *Expr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 130fb04f586e75..ad4ea648cd6148 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -109,7 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert(isa(R) && + assert((isa(R) || isa(R)) && "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 78932643c81fa3..9ec422ec002c82 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -47,6 +47,7 @@ bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPInterleaveSC: return cast(this)->getNumStoreOperands() > 0; + case VPWidenStoreEVLSC: case VPWidenStoreSC: return true; case VPReplicateSC: @@ -63,6 +64,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: case VPWidenSC: @@ -81,6 +83,7 @@ bool VPRecipeBase::mayWriteToMemory() const { bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { + case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; case VPReplicateSC: @@ -90,6 +93,7 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPBranchOnMaskSC: case VPPredInstPHISC: case VPScalarIVStepsSC: + case VPWidenStoreEVLSC: case VPWidenStoreSC: return false; case VPBlendSC: @@ -155,7 +159,9 @@ bool VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenLoadEVLSC: case VPWidenLoadSC: + case VPWidenStoreEVLSC: case VPWidenStoreSC: assert( cast(this)->getIngredient().mayHaveSideEffects() == @@ -411,8 +417,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); Value *AVL = State.Builder.CreateSub(TripCount, Index); Value *EVL = GetEVL(State, AVL); - assert(!State.EVL && "multiple EVL recipes"); - State.EVL = this; return EVL; } case VPInstruction::CanonicalIVIncrementForPart: { @@ -1778,11 +1782,25 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } +void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = vp.load "; + printOperands(O, SlotTracker); +} + void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN store "; printOperands(O, SlotTracker); } + +void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN vp.store "; + printOperands(O, SlotTracker); +} #endif void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 901ecd10c69d8f..007dc3f89b3fb9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1203,43 +1203,52 @@ static VPActiveLaneMaskPHIRecipe 
*addVPLaneMaskPhiAndUpdateExitBranch(
   return LaneMaskPhi;
 }
 
-/// Replaces (ICMP_ULE, WideCanonicalIV, backedge-taken-count) pattern using
-/// the given \p Idiom.
-static void
-replaceHeaderPredicateWith(VPlan &Plan, VPValue &Idiom,
-                           function_ref<bool(VPUser &, unsigned)> Cond = {}) {
+/// Collect all VPValues representing a header mask through the (ICMP_ULE,
+/// WideCanonicalIV, backedge-taken-count) pattern.
+/// TODO: Introduce an explicit recipe for the header mask instead of
+/// searching for the header-mask pattern manually.
+static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
+  SmallVector<VPValue *> WideCanonicalIVs;
   auto *FoundWidenCanonicalIVUser =
       find_if(Plan.getCanonicalIV()->users(),
               [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); });
-  if (FoundWidenCanonicalIVUser == Plan.getCanonicalIV()->users().end())
-    return;
-  auto *WideCanonicalIV =
-      cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
-  // Walk users of WideCanonicalIV and replace all compares of the form
-  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with
-  // the given idiom VPValue.
+  assert(count_if(Plan.getCanonicalIV()->users(),
+                  [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
+             1 &&
+         "Must have at most one VPWideCanonicalIVRecipe");
+  if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
+    auto *WideCanonicalIV =
+        cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
+    WideCanonicalIVs.push_back(WideCanonicalIV);
+  }
+
+  // Also include VPWidenIntOrFpInductionRecipes that represent a widened
+  // version of the canonical induction.
+  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+    if (WidenOriginalIV && WidenOriginalIV->isCanonical())
+      WideCanonicalIVs.push_back(WidenOriginalIV);
+  }
+
+  // Walk users of wide canonical IVs and collect all compares of the form
+  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
+  SmallVector<VPValue *> HeaderMasks;
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) {
-    auto *CompareToReplace = dyn_cast<VPInstruction>(U);
-    if (!CompareToReplace ||
-        CompareToReplace->getOpcode() != Instruction::ICmp ||
-        CompareToReplace->getPredicate() != CmpInst::ICMP_ULE ||
-        CompareToReplace->getOperand(1) != BTC)
-      continue;
+  for (auto *Wide : WideCanonicalIVs) {
+    for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
+      auto *HeaderMask = dyn_cast<VPInstruction>(U);
+      if (!HeaderMask || HeaderMask->getOpcode() != Instruction::ICmp ||
+          HeaderMask->getPredicate() != CmpInst::ICMP_ULE ||
+          HeaderMask->getOperand(1) != BTC)
+        continue;
 
-    assert(CompareToReplace->getOperand(0) == WideCanonicalIV &&
-           "WidenCanonicalIV must be the first operand of the compare");
-    if (Cond) {
-      CompareToReplace->replaceUsesWithIf(&Idiom, Cond);
-      if (!CompareToReplace->getNumUsers())
-        CompareToReplace->eraseFromParent();
-    } else {
-      CompareToReplace->replaceAllUsesWith(&Idiom);
-      CompareToReplace->eraseFromParent();
+      assert(HeaderMask->getOperand(0) == Wide &&
+             "WidenCanonicalIV must be the first operand of the compare");
+      HeaderMasks.push_back(HeaderMask);
     }
   }
-  if (!WideCanonicalIV->getNumUsers())
-    WideCanonicalIV->eraseFromParent();
+  return HeaderMasks;
 }
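In scalar terms, the header mask collected here is the tail-folding predicate: a lane stays active only while its iteration index has not run past the backedge-taken count. A minimal model (illustrative only; it assumes the widened IV does not wrap within a vector step):

#include <cstdint>

// Lane Lane of (ICMP_ULE, WideCanonicalIV, BackedgeTakenCount): the widened
// IV holds WideIV + Lane in that lane, which is active while it is <= BTC.
static bool headerMaskLane(uint64_t WideIV, uint64_t Lane, uint64_t BTC) {
  return WideIV + Lane <= BTC;
}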
 
 void VPlanTransforms::addActiveLaneMask(
@@ -1271,7 +1280,8 @@ void VPlanTransforms::addActiveLaneMask(
   // Walk users of WideCanonicalIV and replace all compares of the form
   // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
   // active-lane-mask.
-  replaceHeaderPredicateWith(Plan, *LaneMask);
+  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
+    HeaderMask->replaceAllUsesWith(LaneMask);
 }
 
 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
@@ -1301,17 +1311,7 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
-  // TODO: revisit this and try to remove the mask operand.
-  // Walk VPWidenMemoryInstructionRecipe users of WideCanonicalIV and replace
-  // all compares of the form (ICMP_ULE, WideCanonicalIV, backedge-taken-count),
-  // used as mask in VPWidenMemoryInstructionRecipe, with an all-true-mask.
-  Value *TrueMask =
-      ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext());
-  VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask);
-  replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) {
-    return isa<VPWidenMemoryRecipe>(U);
-  });
-  // Now create the ExplicitVectorLengthPhi recipe in the main loop.
+  // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
   EVLPhi->insertAfter(CanonicalIVPHI);
   auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
@@ -1336,6 +1336,31 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
   NextEVLIV->insertBefore(CanonicalIVIncrement);
   EVLPhi->addOperand(NextEVLIV);
 
+  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
+    for (VPUser *U : collectUsersRecursively(HeaderMask)) {
+      auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
+      if (!MemR)
+        continue;
+      assert(!MemR->isReverse() &&
+             "Reversed memory operations not supported yet.");
+      VPValue *OrigMask = MemR->getMask();
+      assert(OrigMask && "Unmasked widen memory recipe when folding tail");
+      VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
+      if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
+        auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
+        N->insertBefore(L);
+        L->replaceAllUsesWith(N);
+        L->eraseFromParent();
+      } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
+        auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
+        N->insertBefore(S);
+        S->eraseFromParent();
+      } else {
+        llvm_unreachable("unsupported recipe");
+      }
+    }
+    recursivelyDeleteDeadRecipes(HeaderMask);
+  }
+
   // Replace all uses of VPCanonicalIVPHIRecipe by
   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 0bbc7ffb4a2fe0..96d04271850f70 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -356,7 +356,9 @@ class VPDef { VPWidenCanonicalIVSC, VPWidenCastSC, VPWidenGEPSC, + VPWidenLoadEVLSC, VPWidenLoadSC, + VPWidenStoreEVLSC, VPWidenStoreSC, VPWidenSC, VPWidenSelectSC, diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll index 106f2f9edc2e43..bf81d7aa5e6893 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll @@ -572,45 +572,45 @@ define void @vld3(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> 
undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> ; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> @@ -639,45 +639,45 @@ define void @vld3(ptr %p) 
{ ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; 
CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <12 x i32> 
%v4i32, <12 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> @@ -780,55 +780,55 @@ define void @vld4(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 60 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_3 = shufflevector <16 x 
i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector 
<8 x i32> %v2i32, <8 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <64 x i32> 
%v16i32, <64 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> @@ -863,55 +863,55 @@ define void @vld4(ptr %p) { ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 16 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; 
CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> diff --git a/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll b/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll index d109b3b8748ba7..4a2c10ca55cdcc 100644 --- a/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll +++ 
b/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll @@ -22,6 +22,67 @@ entry: ret i32 %v } +@g1_tls = internal thread_local global i32 0 + +define i32 @test1_tls(ptr %param) { +; Ensure that we can fold a store to a load of a global across a store to +; a parameter when the global is non-escaping. +; +; CHECK-LABEL: define i32 @test1_tls( +; CHECK-SAME: ptr [[PARAM:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @g1_tls) +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: store i32 7, ptr [[PARAM]], align 4 +; CHECK-NEXT: ret i32 42 +; +entry: + %p = call ptr @llvm.threadlocal.address(ptr @g1_tls) + store i32 42, ptr %p + store i32 7, ptr %param + %p2 = call ptr @llvm.threadlocal.address(ptr @g1_tls) + %v = load i32, ptr %p2 + ret i32 %v +} + +define ptr @test1_tls_noopt(ptr %coro, ptr %param) presplitcoroutine { +; CHECK-LABEL: define ptr @test1_tls_noopt( +; CHECK-SAME: ptr [[CORO:%.*]], ptr [[PARAM:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @g1_tls) +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.coro.suspend(token none, i1 false) +; CHECK-NEXT: switch i8 [[TMP0]], label [[SUSPEND:%.*]] [ +; CHECK-NEXT: i8 0, label [[RESUME:%.*]] +; CHECK-NEXT: i8 1, label [[SUSPEND]] +; CHECK-NEXT: ] +; CHECK: resume: +; CHECK-NEXT: [[P2:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @g1_tls) +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P2]], align 4 +; CHECK-NEXT: store i32 [[V]], ptr [[PARAM]], align 4 +; CHECK-NEXT: ret ptr [[CORO]] +; CHECK: suspend: +; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.coro.end(ptr [[CORO]], i1 false, token none) +; CHECK-NEXT: ret ptr [[CORO]] +; +entry: + %p = call ptr @llvm.threadlocal.address(ptr @g1_tls) + store i32 42, ptr %p + + %0 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %0, label %suspend [i8 0, label %resume + i8 1, label %suspend] +resume: + %p2 = call ptr @llvm.threadlocal.address(ptr @g1_tls) + %v = load i32, ptr %p2 + store i32 %v, ptr %param, align 4 + ret ptr %coro + +suspend: + call i1 @llvm.coro.end(ptr %coro, i1 0, token none) + ret ptr %coro +} + declare ptr @f() define i32 @test2() { diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index 5b45ba2552cefd..d64327656a9e01 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -1848,3 +1848,51 @@ define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) { %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff ret <2 x i128> %absel } + +define <8 x i16> @pr88784(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) { +; CHECK-SD-LABEL: pr88784: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: usubl.8h v0, v0, v1 +; CHECK-SD-NEXT: cmlt.8h v1, v2, #0 +; CHECK-SD-NEXT: ssra.8h v0, v2, #15 +; CHECK-SD-NEXT: eor.16b v0, v1, v0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr88784: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: sshr.8h v1, v2, #15 +; CHECK-GI-NEXT: ssra.8h v0, v2, #15 +; CHECK-GI-NEXT: eor.16b v0, v1, v0 +; CHECK-GI-NEXT: ret + %l4 = zext <8 x i8> %l0 to <8 x i16> + %l5 = ashr <8 x i16> %l2, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %l6 = zext <8 x i8> %l1 to <8 x i16> + %l7 = sub <8 x i16> %l4, %l6 + %l8 = add <8 x i16> %l5, %l7 + %l9 = xor <8 x i16> %l5, %l8 + ret <8 x i16> %l9 +} + +define <8 x i16> @pr88784_fixed(<8 x i8> %l0, <8 x i8> %l1, <8 x i16> %l2) { +; CHECK-SD-LABEL: pr88784_fixed: +; CHECK-SD: //
%bb.0: +; CHECK-SD-NEXT: uabdl.8h v0, v0, v1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: pr88784_fixed: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: usubl.8h v0, v0, v1 +; CHECK-GI-NEXT: sshr.8h v1, v0, #15 +; CHECK-GI-NEXT: ssra.8h v0, v0, #15 +; CHECK-GI-NEXT: eor.16b v0, v1, v0 +; CHECK-GI-NEXT: ret + %l4 = zext <8 x i8> %l0 to <8 x i16> + %l6 = zext <8 x i8> %l1 to <8 x i16> + %l7 = sub <8 x i16> %l4, %l6 + %l5 = ashr <8 x i16> %l7, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %l8 = add <8 x i16> %l5, %l7 + %l9 = xor <8 x i16> %l5, %l8 + ret <8 x i16> %l9 +} + diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll index 11f67634a3fb2c..23b2622f5f5863 100644 --- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll +++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll @@ -13,6 +13,21 @@ define <vscale x 4 x i32> @bsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { ret <vscale x 4 x i32> %c } +define <vscale x 4 x i32> @bsl_add_sub(<vscale x 4 x i32> %pre_cond, <vscale x 4 x i32> %left, <vscale x 4 x i32> %right) #0 { +; CHECK-LABEL: bsl_add_sub: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0 +; CHECK-NEXT: bsl z1.d, z1.d, z2.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %neg_cond = sub <vscale x 4 x i32> zeroinitializer, %pre_cond + %min_cond = add <vscale x 4 x i32> %pre_cond, splat(i32 -1) + %left_bits_0 = and <vscale x 4 x i32> %neg_cond, %left + %right_bits_0 = and <vscale x 4 x i32> %min_cond, %right + %bsl0000 = or <vscale x 4 x i32> %right_bits_0, %left_bits_0 + ret <vscale x 4 x i32> %bsl0000 +} + ; we are not expecting bsl instruction here. the constants do not match to fold to bsl. define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) { ; CHECK-LABEL: no_bsl_fold: diff --git a/llvm/test/CodeGen/AArch64/xor.ll b/llvm/test/CodeGen/AArch64/xor.ll index 7d7f7bfdbf38e8..08557c8cd9c4a7 100644 --- a/llvm/test/CodeGen/AArch64/xor.ll +++ b/llvm/test/CodeGen/AArch64/xor.ll @@ -61,9 +61,8 @@ define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not_with_undef(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: vec_add_of_not_with_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %x, %y %r = add <4 x i32> %t0, <i32 -1, i32 -1, i32 undef, i32 -1> @@ -73,11 +72,10 @@ define <4 x i32> @vec_add_of_not_with_undef(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not_with_undef_decrement(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: vec_add_of_not_with_undef_decrement: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %x, %y - %r = add <4 x i32> %t0, <i32 1, i32 1, i32 undef, i32 1> + %r = sub <4 x i32> %t0, <i32 1, i32 1, i32 undef, i32 1> ret <4 x i32> %r } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll index ebd6f18de19d8a..376fe79f542e36 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll @@ -35,3 +35,79 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x } declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>) + +define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { + ; GFX940-LABEL: name: flat_agent_atomic_fadd_ret_v2f16 + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: successors: %bb.1(0x80000000) + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) + ; GFX940-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.1.atomicrmw.start: + ; GFX940-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 + ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 + ; GFX940-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr) + ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec + ; GFX940-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc + ; GFX940-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.2.atomicrmw.end: + ; GFX940-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.1 + ; GFX940-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 + ; GFX940-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: $vgpr0 = COPY [[PHI2]] + ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { + ; GFX940-LABEL: name: flat_agent_atomic_fadd_noret_v2f16 + ; GFX940: bb.0 (%ir-block.0): + ; GFX940-NEXT: successors: %bb.1(0x80000000) + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX940-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) + ; GFX940-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.1.atomicrmw.start: + ; GFX940-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 + ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI 
[[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 + ; GFX940-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX940-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr) + ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec + ; GFX940-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc + ; GFX940-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.2.atomicrmw.end: + ; GFX940-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 + ; GFX940-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: SI_RETURN + %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 94956511c39dfb..961273468e75ff 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -287,6 +287,72 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3 ret i32 %result } +define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f32 ; --------------------------------------------------------------------- @@ -571,6 +637,72 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, ret float %result } +define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, float %in) { +; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, float %in) { +; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: 
s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr float, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -855,6 +987,72 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } +define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_add v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_add v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; 
--------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1139,6 +1337,72 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } +define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1423,61 +1687,127 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { -; 
GCN1-LABEL: flat_atomic_nand_i32_noret: +define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 -; GCN1-NEXT: v_not_b32_e32 v3, v3 -; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_and v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB40_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_nand_i32_noret: +; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 -; GCN2-NEXT: v_not_b32_e32 v3, v3 -; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_and v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB40_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_nand_i32_noret: +; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_nand_i32_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i32_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1489,7 +1819,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB40_1 +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1505,7 +1835,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: 
.LBB41_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1517,7 +1847,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB41_1 +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1529,7 +1859,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1541,7 +1871,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB41_1 +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1551,7 +1881,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1563,7 +1893,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB41_1 +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1578,7 +1908,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -1590,7 +1920,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_1 +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -1601,7 +1931,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -1613,7 +1943,7 @@ define i32 
@flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_1 +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -1624,7 +1954,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1636,7 +1966,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB42_1 +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -1653,7 +1983,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -1665,7 +1995,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB43_1 +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1677,7 +2007,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -1689,7 +2019,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB43_1 +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1699,7 +2029,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -1711,7 +2041,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB43_1 +; GCN3-NEXT: 
s_cbranch_execnz .LBB53_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -1729,7 +2059,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1741,7 +2071,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB44_1 +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1753,7 +2083,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1765,7 +2095,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB44_1 +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1777,7 +2107,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1789,7 +2119,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB44_1 +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1807,7 +2137,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1819,7 +2149,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB45_1 +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] 
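Reviewer note: the nand hunks in this stretch are pure label renumbering (.LBB4x_1 to .LBB5x_1) caused by the new tests inserted earlier in the file; the compare-and-swap loop itself is unchanged, since GCN has no native flat_atomic_nand and the AtomicExpand pass lowers atomicrmw nand to a cmpxchg loop. Below is a minimal IR sketch of that lowering; the function and value names are illustrative, not taken from this test file.

define i32 @nand_cmpxchg_sketch(ptr %ptr, i32 %in) {
entry:
  %init = load i32, ptr %ptr, align 4
  br label %atomicrmw.start

atomicrmw.start:
  ; Loop-carried copy of the most recently observed memory value.
  %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
  %and = and i32 %loaded, %in
  %nand = xor i32 %and, -1        ; nand(a, b) = ~(a & b)
  ; Attempt to publish the new value; fails if memory changed under us.
  %pair = cmpxchg ptr %ptr, i32 %loaded, i32 %nand seq_cst seq_cst
  %newloaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i32 %newloaded
}

This is the shape the GCN1/GCN2/GCN3 checks verify: v_and_b32 plus v_not_b32 compute the new value, flat_atomic_cmpswap ... glc attempts the swap, and v_cmp_eq_u32 with s_andn2_b64 exec retries until the swap succeeds.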
@@ -1833,7 +2163,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1845,7 +2175,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB45_1 +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1857,7 +2187,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1869,7 +2199,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB45_1 +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1888,7 +2218,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -1900,7 +2230,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB46_1 +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1914,7 +2244,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -1926,7 +2256,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB46_1 +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -1940,7 +2270,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, 
s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -1952,7 +2282,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB46_1 +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -1970,7 +2300,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -1982,7 +2312,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB47_1 +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -1996,7 +2326,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -2008,7 +2338,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB47_1 +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2022,7 +2352,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -2034,7 +2364,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB47_1 +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2043,6 +2373,157 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ret i32 %result } +define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: 
flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: v_not_b32_e32 v3, v3 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: v_not_b32_e32 v3, v3 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN1-NEXT: v_not_b32_e32 v0, v0 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN2-NEXT: v_not_b32_e32 v0, v0 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: v_not_b32_e32 v3, v3 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2327,84 +2808,150 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_xor_i32_noret: +define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, 
v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_or v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i32_noret: +; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_or v[0:1], v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_xor_i32_noret: +; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { -; GCN1-LABEL: flat_atomic_xor_i32_noret_offset: +define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 - %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst - ret void + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } -define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_xor_i32_ret: +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 
glc +; GCN1-NEXT: flat_atomic_xor v[0:1], v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_xor_i32_ret: +; GCN2-LABEL: flat_atomic_xor_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i32_noret: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst + ret void +} + +define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i32 4 + %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst + ret void +} + +define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_ret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i32_ret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2611,6 +3158,72 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ret i32 %result } +define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; 
GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -2621,7 +3234,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2632,7 +3245,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB64_1 +; GCN1-NEXT: s_cbranch_execnz .LBB80_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2642,7 +3255,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2653,7 +3266,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, 
exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB64_1 +; GCN2-NEXT: s_cbranch_execnz .LBB80_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2663,7 +3276,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2674,7 +3287,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB64_1 +; GCN3-NEXT: s_cbranch_execnz .LBB80_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2690,7 +3303,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2701,7 +3314,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB65_1 +; GCN1-NEXT: s_cbranch_execnz .LBB81_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2713,7 +3326,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2724,7 +3337,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB65_1 +; GCN2-NEXT: s_cbranch_execnz .LBB81_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2734,7 +3347,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 @@ -2745,7 +3358,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB65_1 +; GCN3-NEXT: s_cbranch_execnz .LBB81_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2760,7 +3373,7 @@ 
define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -2771,7 +3384,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB66_1 +; GCN1-NEXT: s_cbranch_execnz .LBB82_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -2782,7 +3395,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -2793,7 +3406,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB66_1 +; GCN2-NEXT: s_cbranch_execnz .LBB82_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -2804,7 +3417,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -2815,7 +3428,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB66_1 +; GCN3-NEXT: s_cbranch_execnz .LBB82_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -2832,7 +3445,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -2843,7 +3456,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB67_1 +; GCN1-NEXT: s_cbranch_execnz .LBB83_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2855,7 +3468,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start ; 
GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -2866,7 +3479,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB67_1 +; GCN2-NEXT: s_cbranch_execnz .LBB83_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2876,7 +3489,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -2887,7 +3500,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB67_1 +; GCN3-NEXT: s_cbranch_execnz .LBB83_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -2905,7 +3518,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2916,7 +3529,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB68_1 +; GCN1-NEXT: s_cbranch_execnz .LBB84_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -2928,7 +3541,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2939,7 +3552,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB68_1 +; GCN2-NEXT: s_cbranch_execnz .LBB84_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -2951,7 +3564,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2962,7 +3575,7 @@ define amdgpu_gfx void 
@flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB68_1 +; GCN3-NEXT: s_cbranch_execnz .LBB84_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -2980,7 +3593,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3 @@ -2991,7 +3604,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB69_1 +; GCN1-NEXT: s_cbranch_execnz .LBB85_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3005,7 +3618,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 @@ -3016,7 +3629,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB69_1 +; GCN2-NEXT: s_cbranch_execnz .LBB85_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3028,7 +3641,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 @@ -3039,7 +3652,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB69_1 +; GCN3-NEXT: s_cbranch_execnz .LBB85_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3058,7 +3671,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -3069,7 +3682,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; 
GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB70_1 +; GCN1-NEXT: s_cbranch_execnz .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3083,7 +3696,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -3094,7 +3707,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB70_1 +; GCN2-NEXT: s_cbranch_execnz .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3108,7 +3721,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -3119,7 +3732,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB70_1 +; GCN3-NEXT: s_cbranch_execnz .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3137,7 +3750,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -3148,7 +3761,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB71_1 +; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3162,7 +3775,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -3173,7 +3786,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB71_1 +; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3187,7 +3800,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -3198,7 +3811,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB71_1 +; GCN3-NEXT: s_cbranch_execnz .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3223,7 +3836,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3234,7 +3847,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB72_1 +; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3253,7 +3866,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3264,7 +3877,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB72_1 +; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3281,7 +3894,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3292,7 +3905,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB72_1 +; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3319,7 +3932,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 
-; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -3330,7 +3943,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB73_1 +; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3354,7 +3967,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -3365,7 +3978,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB73_1 +; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -3387,7 +4000,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -3398,7 +4011,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB73_1 +; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3427,7 +4040,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3438,7 +4051,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB74_1 +; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3455,7 +4068,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_i32_e32 v2, s2, 
v3 @@ -3466,7 +4079,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB74_1 +; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3483,7 +4096,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 @@ -3494,7 +4107,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB74_1 +; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3518,7 +4131,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -3529,7 +4142,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB75_1 +; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3551,7 +4164,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -3562,7 +4175,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB75_1 +; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -3584,7 +4197,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -3595,7 +4208,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB75_1 +; 
GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -3609,20 +4222,18 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { -; GCN1-LABEL: flat_atomic_umax_i32_noret: +define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: v_max_i32_e32 v3, v4, v2 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3630,20 +4241,22 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB76_1 +; GCN1-NEXT: s_cbranch_execnz .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_umax_i32_noret: +; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: v_max_i32_e32 v3, v4, v2 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3651,91 +4264,236 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB76_1 +; GCN2-NEXT: s_cbranch_execnz .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_umax_i32_noret: +; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB76_1 +; GCN3-NEXT: s_cbranch_execnz .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { -; GCN1-LABEL: flat_atomic_umax_i32_noret_offset: +define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_max_i32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB77_1 +; GCN1-NEXT: s_cbranch_execnz .LBB93_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 -; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_max_i32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB77_1 +; GCN2-NEXT: s_cbranch_execnz .LBB93_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_umax_i32_noret_offset: +; 
GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_max_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) { +; GCN1-LABEL: flat_atomic_umax_i32_noret: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i32_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i32_noret: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: 
buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst + ret void +} + +define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i32_noret_offset: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -3743,7 +4501,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB77_1 +; GCN3-NEXT: s_cbranch_execnz .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3758,7 +4516,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -3769,7 +4527,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB78_1 +; GCN1-NEXT: s_cbranch_execnz .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -3780,7 +4538,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -3791,7 +4549,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB78_1 +; GCN2-NEXT: s_cbranch_execnz .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -3802,7 +4560,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -3813,7 +4571,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB78_1 +; GCN3-NEXT: s_cbranch_execnz .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -3830,7 +4588,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -3841,7 +4599,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB79_1 +; GCN1-NEXT: s_cbranch_execnz .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3853,7 +4611,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -3864,7 +4622,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; 
GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB79_1 +; GCN2-NEXT: s_cbranch_execnz .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3874,7 +4632,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -3885,7 +4643,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB79_1 +; GCN3-NEXT: s_cbranch_execnz .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -3903,7 +4661,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3914,7 +4672,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3926,7 +4684,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3937,7 +4695,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3949,7 +4707,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3960,7 +4718,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: 
s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3978,7 +4736,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3 @@ -3989,7 +4747,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4003,7 +4761,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 @@ -4014,7 +4772,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4026,7 +4784,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 @@ -4037,7 +4795,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4056,7 +4814,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4067,7 +4825,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_cbranch_execnz .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: 
s_setpc_b64 s[30:31] @@ -4081,7 +4839,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -4092,7 +4850,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; GCN2-NEXT: s_cbranch_execnz .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4106,7 +4864,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -4117,7 +4875,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_cbranch_execnz .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4135,7 +4893,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4146,7 +4904,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4160,7 +4918,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -4171,7 +4929,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4185,7 +4943,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; 
GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -4196,7 +4954,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_cbranch_execnz .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4221,7 +4979,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_max_u32_e32 v2, s2, v3 @@ -4232,7 +4990,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB84_1 +; GCN1-NEXT: s_cbranch_execnz .LBB102_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -4251,7 +5009,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 @@ -4262,7 +5020,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB84_1 +; GCN2-NEXT: s_cbranch_execnz .LBB102_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -4279,7 +5037,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 @@ -4290,7 +5048,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB84_1 +; GCN3-NEXT: s_cbranch_execnz .LBB102_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -4317,7 +5075,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 
v3, v2 @@ -4328,7 +5086,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB85_1 +; GCN1-NEXT: s_cbranch_execnz .LBB103_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4352,7 +5110,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -4363,7 +5121,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB85_1 +; GCN2-NEXT: s_cbranch_execnz .LBB103_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4385,7 +5143,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -4396,7 +5154,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB85_1 +; GCN3-NEXT: s_cbranch_execnz .LBB103_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -4426,7 +5184,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -4437,7 +5195,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB86_1 +; GCN1-NEXT: s_cbranch_execnz .LBB104_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -4459,7 +5217,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -4470,7 +5228,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, 
v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB86_1 +; GCN2-NEXT: s_cbranch_execnz .LBB104_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4492,29 +5250,174 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s7 +; GCN3-NEXT: flat_store_dword v[0:1], v2 +; GCN3-NEXT: s_endpgm +entry: + %ptr = getelementptr i32, ptr %out, i32 %index + %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst + store i32 %tmp0, ptr %out2 + ret void +} + +define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_max_u32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_max_u32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v2 -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 -; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_max_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 
glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB86_1 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB106_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: flat_store_dword v[0:1], v2 -; GCN3-NEXT: s_endpgm -entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst - store i32 %tmp0, ptr %out2 - ret void +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } ; --------------------------------------------------------------------- @@ -4527,7 +5430,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4538,7 +5441,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB87_1 +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4548,7 +5451,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4559,7 +5462,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB87_1 +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4569,7 +5472,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4580,7 +5483,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB87_1 +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, 
s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4596,7 +5499,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4607,7 +5510,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB88_1 +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4619,7 +5522,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4630,7 +5533,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB88_1 +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4640,7 +5543,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 @@ -4651,7 +5554,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB88_1 +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4666,7 +5569,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -4677,7 +5580,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB89_1 +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -4688,7 +5591,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; 
GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -4699,7 +5602,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB89_1 +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -4710,7 +5613,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -4721,7 +5624,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB89_1 +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -4738,7 +5641,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -4749,7 +5652,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB90_1 +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4761,7 +5664,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -4772,7 +5675,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB90_1 +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4782,7 +5685,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
v_mov_b32_e32 v4, v3 @@ -4793,7 +5696,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB90_1 +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -4811,7 +5714,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4822,7 +5725,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB91_1 +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4834,7 +5737,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4845,7 +5748,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB91_1 +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4857,7 +5760,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4868,7 +5771,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB91_1 +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4886,7 +5789,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4897,7 +5800,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 
s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB92_1 +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4911,7 +5814,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4922,7 +5825,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB92_1 +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4934,7 +5837,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_u32_e32 v2, s6, v3 @@ -4945,7 +5848,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB92_1 +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4964,7 +5867,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -4975,7 +5878,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4989,7 +5892,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5000,7 +5903,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: 
s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5014,7 +5917,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5025,7 +5928,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5043,7 +5946,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -5054,7 +5957,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5068,7 +5971,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5079,7 +5982,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5093,7 +5996,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5104,7 +6007,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 
exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5113,6 +6016,151 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ret i32 %result } +define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; 
GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_min_u32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_min_u32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_min_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw min ; --------------------------------------------------------------------- @@ -5123,7 +6171,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5134,7 +6182,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: 
s_cbranch_execnz .LBB95_1 +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5144,7 +6192,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5155,7 +6203,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB95_1 +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5165,7 +6213,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5176,7 +6224,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB95_1 +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5192,7 +6240,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5203,7 +6251,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v4, v3 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB96_1 +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5215,7 +6263,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5226,7 +6274,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v4, v3 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB96_1 +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5236,7 +6284,7 @@ define void 
@flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 @@ -5247,7 +6295,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v4, v3 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB96_1 +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5262,7 +6310,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 @@ -5273,7 +6321,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB97_1 +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v3 @@ -5284,7 +6332,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 @@ -5295,7 +6343,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB97_1 +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v3 @@ -5306,7 +6354,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -5317,7 +6365,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB97_1 +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -5334,7 +6382,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v0, v[3:4] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB120_1: ; 
%atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 @@ -5345,7 +6393,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB98_1 +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5357,7 +6405,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; GCN2-NEXT: flat_load_dword v0, v[3:4] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 @@ -5368,7 +6416,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB98_1 +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5378,7 +6426,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 @@ -5389,7 +6437,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB98_1 +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 @@ -5407,7 +6455,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5418,7 +6466,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB99_1 +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5430,7 +6478,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5441,7 +6489,7 @@ define amdgpu_gfx void 
@flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB99_1 +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5453,7 +6501,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5464,7 +6512,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB99_1 +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5482,7 +6530,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: v_mov_b32_e32 v1, s35 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5493,7 +6541,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB100_1 +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5507,7 +6555,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: v_mov_b32_e32 v1, s35 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5518,7 +6566,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB100_1 +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5530,7 +6578,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 @@ -5541,7 +6589,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: 
v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB100_1 +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5560,7 +6608,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_mov_b32_e32 v1, s4 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v2, s5 -; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -5571,7 +6619,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB101_1 +; GCN1-NEXT: s_cbranch_execnz .LBB123_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5585,7 +6633,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_mov_b32_e32 v1, s4 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v2, s5 -; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5596,7 +6644,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB101_1 +; GCN2-NEXT: s_cbranch_execnz .LBB123_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5610,7 +6658,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5621,7 +6669,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB101_1 +; GCN3-NEXT: s_cbranch_execnz .LBB123_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5639,7 +6687,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_mov_b32_e32 v2, s35 ; GCN1-NEXT: flat_load_dword v0, v[1:2] ; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v0 @@ -5650,7 +6698,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB102_1 +; GCN1-NEXT: 
s_cbranch_execnz .LBB124_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -5664,7 +6712,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_mov_b32_e32 v2, s35 ; GCN2-NEXT: flat_load_dword v0, v[1:2] ; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v0 @@ -5675,7 +6723,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB102_1 +; GCN2-NEXT: s_cbranch_execnz .LBB124_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -5689,7 +6737,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_mov_b32_e32 v1, s4 ; GCN3-NEXT: s_mov_b64 s[34:35], 0 ; GCN3-NEXT: v_mov_b32_e32 v2, s5 -; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v0 @@ -5700,7 +6748,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB102_1 +; GCN3-NEXT: s_cbranch_execnz .LBB124_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -5725,7 +6773,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v3, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5736,7 +6784,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB103_1 +; GCN1-NEXT: s_cbranch_execnz .LBB125_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5755,7 +6803,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5766,7 +6814,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB103_1 +; GCN2-NEXT: s_cbranch_execnz .LBB125_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5783,7 +6831,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % 
; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5794,7 +6842,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB103_1 +; GCN3-NEXT: s_cbranch_execnz .LBB125_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -5821,7 +6869,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -5832,7 +6880,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB104_1 +; GCN1-NEXT: s_cbranch_execnz .LBB126_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -5856,7 +6904,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -5867,7 +6915,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB104_1 +; GCN2-NEXT: s_cbranch_execnz .LBB126_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -5889,7 +6937,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -5900,7 +6948,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB104_1 +; GCN3-NEXT: s_cbranch_execnz .LBB126_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -5925,7 +6973,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: flat_load_dword v3, v[0:1] -; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start ; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5936,7 +6984,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v3, v2 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB105_1 +; GCN1-NEXT: s_cbranch_execnz .LBB127_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -5949,7 +6997,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: flat_load_dword v3, v[0:1] -; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5960,7 +7008,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v3, v2 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -5973,7 +7021,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 ; GCN3-NEXT: flat_load_dword v3, v[0:1] -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 @@ -5984,7 +7032,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v3, v2 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -6007,7 +7055,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_mov_b32_e32 v1, s1 ; GCN1-NEXT: flat_load_dword v2, v[0:1] ; GCN1-NEXT: s_mov_b64 s[0:1], 0 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v3, v2 @@ -6018,7 +7066,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -6040,7 +7088,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v2, v[0:1] ; GCN2-NEXT: s_mov_b64 s[0:1], 0 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v3, v2 @@ -6051,7 +7099,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; 
GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -6073,7 +7121,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v2, v[0:1] ; GCN3-NEXT: s_mov_b64 s[0:1], 0 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v3, v2 @@ -6084,7 +7132,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s6 @@ -6098,6 +7146,151 @@ entry: ret void } +define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) { +; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_min_i32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_min_i32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_min_i32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i32 %in seq_cst, 
!amdgpu.no.remote.memory.access !0
+  ret i32 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw uinc_wrap
 ; ---------------------------------------------------------------------
@@ -6382,6 +7575,72 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
   ret i32 %result
 }
 
+define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr %out, i64 4
+  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr %out, i64 4
+  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i32 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw udec_wrap
 ; ---------------------------------------------------------------------
@@ -6665,3 +7924,71 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
   %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
   ret i32 %result
 }
+
+define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr %out, i64 4
+  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i32 %in) {
+; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr %out, i64 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i32 %result
+}
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 7fc4484608c296..4bf3e2fdd0511f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -299,6 +299,72 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
   ret i64 %result
 }
 
+define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw xchg f64
 ; ---------------------------------------------------------------------
@@ -595,6 +661,72 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
   ret double %result
 }
 
+define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, double %in) {
+; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr %out, i64 4
+  %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, double %in) {
+; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr %out, i64 4
+  %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret double %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw add
 ; ---------------------------------------------------------------------
@@ -891,6 +1023,72 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
+define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw sub
 ; ---------------------------------------------------------------------
@@ -1187,6 +1385,72 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
+define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw and
 ; ---------------------------------------------------------------------
@@ -1483,48 +1747,114 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-; ---------------------------------------------------------------------
-; atomicrmw nand
-; ---------------------------------------------------------------------
-
-define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
-; GCN1-LABEL: flat_atomic_nand_i64_noret:
+define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
-; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
-; GCN1-NEXT: v_not_b32_e32 v5, v4
-; GCN1-NEXT: v_not_b32_e32 v4, v8
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB40_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_nand_i64_noret:
+; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
+; ---------------------------------------------------------------------
+; atomicrmw nand
+; ---------------------------------------------------------------------
+
+define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
+; GCN1-LABEL: flat_atomic_nand_i64_noret:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
+; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
+; GCN1-NEXT: v_not_b32_e32 v5, v4
+; GCN1-NEXT: v_not_b32_e32 v4, v8
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB50_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_nand_i64_noret:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
@@ -1539,7 +1869,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB40_1
+; GCN2-NEXT: s_cbranch_execnz .LBB50_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -1549,7 +1879,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
@@ -1564,7 +1894,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB40_1
+; GCN3-NEXT: s_cbranch_execnz .LBB50_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -1583,7 +1913,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_and_b32_e32 v0, v7, v3
@@ -1598,7 +1928,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB41_1
+; GCN1-NEXT: s_cbranch_execnz .LBB51_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -1613,7 +1943,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_and_b32_e32 v0, v7, v3
@@ -1628,7 +1958,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB41_1
+; GCN2-NEXT: s_cbranch_execnz .LBB51_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -1638,7 +1968,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
@@ -1653,7 +1983,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB41_1
+; GCN3-NEXT: s_cbranch_execnz .LBB51_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -1671,7 +2001,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
@@ -1686,7 +2016,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB42_1
+; GCN1-NEXT: s_cbranch_execnz .LBB52_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
@@ -1701,7 +2031,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
@@ -1716,7 +2046,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB42_1
+; GCN2-NEXT: s_cbranch_execnz .LBB52_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
@@ -1728,7 +2058,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -1743,7 +2073,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB42_1
+; GCN3-NEXT: s_cbranch_execnz .LBB52_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -1764,7 +2094,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -1779,7 +2109,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB43_1
+; GCN1-NEXT: s_cbranch_execnz .LBB53_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -1794,7 +2124,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -1809,7 +2139,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB43_1
+; GCN2-NEXT: s_cbranch_execnz .LBB53_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -1819,7 +2149,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -1834,7 +2164,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB43_1
+; GCN3-NEXT: s_cbranch_execnz .LBB53_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -1860,7 +2190,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
@@ -1875,7 +2205,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB44_1
+; GCN1-NEXT: s_cbranch_execnz .LBB54_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -1894,7 +2224,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
@@ -1909,7 +2239,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB44_1
+; GCN2-NEXT: s_cbranch_execnz .LBB54_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -1923,7 +2253,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
@@ -1938,7 +2268,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB44_1
+; GCN3-NEXT: s_cbranch_execnz .LBB54_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -1961,7 +2291,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
-; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
@@ -1976,7 +2306,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB45_1
+; GCN1-NEXT: s_cbranch_execnz .LBB55_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -1995,7 +2325,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
-; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
@@ -2010,7 +2340,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB45_1
+; GCN2-NEXT: s_cbranch_execnz .LBB55_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -2024,7 +2354,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
@@ -2039,7 +2369,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB45_1
+; GCN3-NEXT: s_cbranch_execnz .LBB55_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -2063,7 +2393,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
@@ -2078,7 +2408,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB46_1
+; GCN1-NEXT: s_cbranch_execnz .LBB56_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -2097,7 +2427,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
@@ -2112,7 +2442,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB46_1
+; GCN2-NEXT: s_cbranch_execnz .LBB56_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -2126,7 +2456,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v7, v1
@@ -2141,7 +2471,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB46_1
+; GCN3-NEXT: s_cbranch_execnz .LBB56_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -2164,7 +2494,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
-; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
@@ -2179,7 +2509,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB47_1
+; GCN1-NEXT: s_cbranch_execnz .LBB57_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -2198,7 +2528,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
-; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
@@ -2213,7 +2543,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB47_1
+; GCN2-NEXT: s_cbranch_execnz .LBB57_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -2227,7 +2557,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v7, v1
@@ -2242,7 +2572,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB47_1
+; GCN3-NEXT: s_cbranch_execnz .LBB57_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -2251,6 +2581,188 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
   ret i64 %result
 }
 
+define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[0:1]
+; GCN1-NEXT: flat_load_dword v6, v[8:9]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v0, v7, v3
+; GCN1-NEXT: v_and_b32_e32 v1, v6, v2
+; GCN1-NEXT: v_not_b32_e32 v5, v0
+; GCN1-NEXT: v_not_b32_e32 v4, v1
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB58_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[0:1]
+; GCN2-NEXT: flat_load_dword v6, v[8:9]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v0, v7, v3
+; GCN2-NEXT: v_and_b32_e32 v1, v6, v2
+; GCN2-NEXT: v_not_b32_e32 v5, v0
+; GCN2-NEXT: v_not_b32_e32 v4, v1
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB58_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
+; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
+; GCN3-NEXT: v_not_b32_e32 v5, v4
+; GCN3-NEXT: v_not_b32_e32 v4, v8
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB58_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_and_b32_e32 v0, v9, v3
+; GCN1-NEXT: v_and_b32_e32 v1, v8, v2
+; GCN1-NEXT: v_not_b32_e32 v7, v0
+; GCN1-NEXT: v_not_b32_e32 v6, v1
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB59_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_and_b32_e32 v0, v9, v3
+; GCN2-NEXT: v_and_b32_e32 v1, v8, v2
+; GCN2-NEXT: v_not_b32_e32 v7, v0
+; GCN2-NEXT: v_not_b32_e32 v6, v1
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB59_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
+; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
+; GCN3-NEXT: v_not_b32_e32 v5, v4
+; GCN3-NEXT: v_not_b32_e32 v4, v8
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB59_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v4
+; GCN3-NEXT: v_mov_b32_e32 v1, v5
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw or
 ; ---------------------------------------------------------------------
@@ -2547,23 +3059,89 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
-; ---------------------------------------------------------------------
-; atomicrmw xor
-; ---------------------------------------------------------------------
-
-define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
-; GCN1-LABEL: flat_atomic_xor_i64_noret:
+define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
 ; GCN1: ; %bb.0:
 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: buffer_wbinvl1_vol
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
 ;
-; GCN2-LABEL: flat_atomic_xor_i64_noret:
+; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
 ; GCN2: ; %bb.0:
 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
+; ---------------------------------------------------------------------
+; atomicrmw xor
+; ---------------------------------------------------------------------
+
+define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
+; GCN1-LABEL: flat_atomic_xor_i64_noret:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xor_i64_noret:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: buffer_wbinvl1_vol
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -2843,6 +3421,72 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
   ret i64 %result
 }
 
+define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw max
 ; ---------------------------------------------------------------------
@@ -2856,7 +3500,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
@@ -2870,7 +3514,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB64_1
+; GCN1-NEXT: s_cbranch_execnz .LBB80_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -2883,7 +3527,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
@@ -2897,7 +3541,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB64_1
+; GCN2-NEXT: s_cbranch_execnz .LBB80_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -2907,7 +3551,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
@@ -2921,7 +3565,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB64_1
+; GCN3-NEXT: s_cbranch_execnz .LBB80_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -2940,7 +3584,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
@@ -2954,7 +3598,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB65_1
+; GCN1-NEXT: s_cbranch_execnz .LBB81_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -2969,7 +3613,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
@@ -2983,7 +3627,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB65_1
+; GCN2-NEXT: s_cbranch_execnz .LBB81_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -2993,7 +3637,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
@@ -3007,7 +3651,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB65_1
+; GCN3-NEXT: s_cbranch_execnz .LBB81_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -3025,7 +3669,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
@@ -3039,7 +3683,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB66_1
+; GCN1-NEXT: s_cbranch_execnz .LBB82_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
@@ -3054,7 +3698,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
@@ -3068,7 +3712,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB66_1
+; GCN2-NEXT: s_cbranch_execnz .LBB82_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
@@ -3080,7 +3724,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -3094,7 +3738,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB66_1
+; GCN3-NEXT: s_cbranch_execnz .LBB82_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -3115,7 +3759,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -3129,7 +3773,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB67_1
+; GCN1-NEXT: s_cbranch_execnz .LBB83_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -3144,7 +3788,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -3158,7 +3802,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB67_1
+; GCN2-NEXT: s_cbranch_execnz .LBB83_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -3168,7 +3812,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -3182,7 +3826,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB67_1
+; GCN3-NEXT: s_cbranch_execnz .LBB83_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -3210,7 +3854,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: .LBB68_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
@@ -3224,7 +3868,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB68_1
+; GCN1-NEXT: s_cbranch_execnz .LBB84_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -3245,7 +3889,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: .LBB68_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
@@ -3259,7 +3903,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB68_1
+; GCN2-NEXT: s_cbranch_execnz .LBB84_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -3275,7 +3919,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB68_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
@@ -3289,7 +3933,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB68_1
+; GCN3-NEXT: s_cbranch_execnz .LBB84_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -3314,7 +3958,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
-; GCN1-NEXT: .LBB69_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start
 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
@@ -3328,7 +3972,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB69_1
+; GCN1-NEXT: s_cbranch_execnz .LBB85_1
 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -3349,7 +3993,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
-; GCN2-NEXT: .LBB69_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start
 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
@@ -3363,7 +4007,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB69_1
+; GCN2-NEXT: s_cbranch_execnz .LBB85_1
 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -3379,7 +4023,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB69_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start
 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
@@ -3393,7 +4037,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB69_1
+; GCN3-NEXT: s_cbranch_execnz .LBB85_1
 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -3419,7 +4063,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3433,7 +4077,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB70_1 +; GCN1-NEXT: s_cbranch_execnz .LBB86_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3454,7 +4098,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3468,7 +4112,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB70_1 +; GCN2-NEXT: s_cbranch_execnz .LBB86_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3484,7 +4128,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -3498,7 +4142,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB70_1 +; GCN3-NEXT: s_cbranch_execnz .LBB86_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3523,7 +4167,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -3537,7 +4181,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB71_1 +; GCN1-NEXT: s_cbranch_execnz .LBB87_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -3558,7 +4202,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_mov_b64 
s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -3572,7 +4216,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB71_1 +; GCN2-NEXT: s_cbranch_execnz .LBB87_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -3588,7 +4232,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -3602,7 +4246,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB71_1 +; GCN3-NEXT: s_cbranch_execnz .LBB87_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -3628,7 +4272,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3642,7 +4286,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB72_1 +; GCN1-NEXT: s_cbranch_execnz .LBB88_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3662,7 +4306,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3676,7 +4320,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB72_1 +; GCN2-NEXT: s_cbranch_execnz .LBB88_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3694,7 +4338,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3708,7 +4352,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB72_1 +; GCN3-NEXT: s_cbranch_execnz .LBB88_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3734,7 +4378,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -3748,7 +4392,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB73_1 +; GCN1-NEXT: s_cbranch_execnz .LBB89_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3771,7 +4415,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -3785,7 +4429,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB73_1 +; GCN2-NEXT: s_cbranch_execnz .LBB89_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -3806,7 +4450,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -3820,7 +4464,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB73_1 +; GCN3-NEXT: s_cbranch_execnz .LBB89_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -3850,7 +4494,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s3 ; GCN1-NEXT: v_mov_b32_e32 v7, s2 -; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3864,7 +4508,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB74_1 +; GCN1-NEXT: s_cbranch_execnz .LBB90_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; @@ -3882,7 +4526,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s3 ; GCN2-NEXT: v_mov_b32_e32 v7, s2 -; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] @@ -3896,7 +4540,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB74_1 +; GCN2-NEXT: s_cbranch_execnz .LBB90_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -3914,7 +4558,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 -; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3928,7 +4572,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB74_1 +; GCN3-NEXT: s_cbranch_execnz .LBB90_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -3951,7 +4595,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -3965,7 +4609,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB75_1 +; GCN1-NEXT: s_cbranch_execnz .LBB91_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -3986,7 +4630,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -4000,7 +4644,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB75_1 +; GCN2-NEXT: s_cbranch_execnz .LBB91_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -4021,7 +4665,7 @@ 
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -4035,7 +4679,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB75_1 +; GCN3-NEXT: s_cbranch_execnz .LBB91_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -4049,77 +4693,77 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { -; GCN1-LABEL: flat_atomic_umax_i64_noret: +define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: flat_load_dword v6, v[0:1] -; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB76_1 +; GCN1-NEXT: s_cbranch_execnz .LBB92_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; -; GCN2-LABEL: flat_atomic_umax_i64_noret: +; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: flat_load_dword v6, v[0:1] -; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, 
v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB76_1 +; GCN2-NEXT: s_cbranch_execnz .LBB92_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; -; GCN3-LABEL: flat_atomic_umax_i64_noret: +; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -4127,26 +4771,202 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB76_1 +; GCN3-NEXT: s_cbranch_execnz .LBB92_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { -; GCN1-LABEL: flat_atomic_umax_i64_noret_offset: +define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start +; 
GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB93_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB93_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB93_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { +; GCN1-LABEL: flat_atomic_umax_i64_noret: +; GCN1: ; %bb.0: +; 
GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB94_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_umax_i64_noret: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB94_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_umax_i64_noret: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB94_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst + ret void +} + +define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_umax_i64_noret_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; 
GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN1-NEXT: flat_load_dword v7, v[0:1] ; GCN1-NEXT: flat_load_dword v6, v[8:9] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4160,7 +4980,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v6, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB77_1 +; GCN1-NEXT: s_cbranch_execnz .LBB95_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4175,7 +4995,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v7, v[0:1] ; GCN2-NEXT: flat_load_dword v6, v[8:9] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4189,7 +5009,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v6, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB77_1 +; GCN2-NEXT: s_cbranch_execnz .LBB95_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4199,7 +5019,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4213,7 +5033,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v6, v4 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB77_1 +; GCN3-NEXT: s_cbranch_execnz .LBB95_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4231,7 +5051,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: flat_load_dword v4, v[0:1] ; GCN1-NEXT: flat_load_dword v5, v[5:6] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v7, v5 @@ -4245,7 +5065,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB78_1 +; GCN1-NEXT: s_cbranch_execnz .LBB96_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: v_mov_b32_e32 v0, v4 @@ -4260,7 +5080,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: flat_load_dword v4, v[0:1] ; GCN2-NEXT: flat_load_dword v5, v[5:6] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN2-NEXT: 
.LBB96_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v7, v5 @@ -4274,7 +5094,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB78_1 +; GCN2-NEXT: s_cbranch_execnz .LBB96_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v0, v4 @@ -4286,7 +5106,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -4300,7 +5120,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB78_1 +; GCN3-NEXT: s_cbranch_execnz .LBB96_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -4321,7 +5141,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: flat_load_dword v1, v[0:1] ; GCN1-NEXT: flat_load_dword v0, v[4:5] ; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4335,7 +5155,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB79_1 +; GCN1-NEXT: s_cbranch_execnz .LBB97_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4350,7 +5170,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: flat_load_dword v1, v[0:1] ; GCN2-NEXT: flat_load_dword v0, v[4:5] ; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4364,7 +5184,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB79_1 +; GCN2-NEXT: s_cbranch_execnz .LBB97_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4374,7 +5194,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 ; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v7, v5 @@ -4388,7 +5208,7 @@ define i64 
@flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB79_1 +; GCN3-NEXT: s_cbranch_execnz .LBB97_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v4 @@ -4416,7 +5236,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 ; GCN1-NEXT: v_mov_b32_e32 v5, s5 -; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4430,7 +5250,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB80_1 +; GCN1-NEXT: s_cbranch_execnz .LBB98_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4451,7 +5271,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 ; GCN2-NEXT: v_mov_b32_e32 v5, s5 -; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4465,7 +5285,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB80_1 +; GCN2-NEXT: s_cbranch_execnz .LBB98_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4481,7 +5301,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4495,7 +5315,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB80_1 +; GCN3-NEXT: s_cbranch_execnz .LBB98_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4520,7 +5340,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v6, s7 ; GCN1-NEXT: v_mov_b32_e32 v7, s6 -; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4534,7 +5354,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: 
v_mov_b32_e32 v2, v0 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB81_1 +; GCN1-NEXT: s_cbranch_execnz .LBB99_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4555,7 +5375,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v6, s7 ; GCN2-NEXT: v_mov_b32_e32 v7, s6 -; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4569,7 +5389,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB81_1 +; GCN2-NEXT: s_cbranch_execnz .LBB99_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4585,7 +5405,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: v_mov_b32_e32 v6, s7 ; GCN3-NEXT: v_mov_b32_e32 v7, s6 ; GCN3-NEXT: v_mov_b32_e32 v5, s5 -; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -4599,7 +5419,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB81_1 +; GCN3-NEXT: s_cbranch_execnz .LBB99_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4625,7 +5445,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4639,7 +5459,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB82_1 +; GCN1-NEXT: s_cbranch_execnz .LBB100_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4660,7 +5480,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4674,7 +5494,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB82_1 +; 
GCN2-NEXT: s_cbranch_execnz .LBB100_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4690,7 +5510,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4704,7 +5524,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB82_1 +; GCN3-NEXT: s_cbranch_execnz .LBB100_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] @@ -4729,7 +5549,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: s_mov_b64 s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v1 @@ -4743,7 +5563,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB83_1 +; GCN1-NEXT: s_cbranch_execnz .LBB101_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] @@ -4764,7 +5584,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: s_mov_b64 s[34:35], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: v_mov_b32_e32 v5, s6 -; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v1 @@ -4778,7 +5598,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB83_1 +; GCN2-NEXT: s_cbranch_execnz .LBB101_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] @@ -4794,7 +5614,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_mov_b32_e32 v4, s7 ; GCN3-NEXT: v_mov_b32_e32 v5, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v1 @@ -4808,7 +5628,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB83_1 +; GCN3-NEXT: s_cbranch_execnz .LBB101_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, 
s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -4834,7 +5654,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v6, s3
; GCN1-NEXT: v_mov_b32_e32 v7, s2
-; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
@@ -4848,7 +5668,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN1-NEXT: s_cbranch_execnz .LBB84_1
+; GCN1-NEXT: s_cbranch_execnz .LBB102_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
@@ -4868,7 +5688,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v6, s3
; GCN2-NEXT: v_mov_b32_e32 v7, s2
-; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
@@ -4882,7 +5702,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN2-NEXT: s_cbranch_execnz .LBB84_1
+; GCN2-NEXT: s_cbranch_execnz .LBB102_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
@@ -4900,7 +5720,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
-; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
@@ -4914,7 +5734,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN3-NEXT: s_cbranch_execnz .LBB84_1
+; GCN3-NEXT: s_cbranch_execnz .LBB102_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
@@ -4940,7 +5760,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s5
; GCN1-NEXT: v_mov_b32_e32 v5, s4
-; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v3
@@ -4954,7 +5774,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN1-NEXT: s_cbranch_execnz .LBB85_1
+; GCN1-NEXT: s_cbranch_execnz .LBB103_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v0, s2
@@ -4977,7 +5797,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s5
; GCN2-NEXT: v_mov_b32_e32 v5, s4
-; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
@@ -4991,7 +5811,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN2-NEXT: s_cbranch_execnz .LBB85_1
+; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v0, s2
@@ -5012,7 +5832,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
-; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
@@ -5026,7 +5846,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN3-NEXT: s_cbranch_execnz .LBB85_1
+; GCN3-NEXT: s_cbranch_execnz .LBB103_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
@@ -5055,7 +5875,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s5
; GCN1-NEXT: v_mov_b32_e32 v5, s4
-; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v3
@@ -5069,7 +5889,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN1-NEXT: s_cbranch_execnz .LBB86_1
+; GCN1-NEXT: s_cbranch_execnz .LBB104_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v0, s2
@@ -5090,7 +5910,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s5
; GCN2-NEXT: v_mov_b32_e32 v5, s4
-; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
@@ -5104,7 +5924,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN2-NEXT: s_cbranch_execnz .LBB86_1
+; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v0, s2
@@ -5125,7 +5945,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
-; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
@@ -5139,7 +5959,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN3-NEXT: s_cbranch_execnz .LBB86_1
+; GCN3-NEXT: s_cbranch_execnz .LBB104_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
@@ -5153,6 +5973,182 @@ entry:
  ret void
}

+define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[0:1]
+; GCN1-NEXT: flat_load_dword v6, v[8:9]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB105_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[0:1]
+; GCN2-NEXT: flat_load_dword v6, v[8:9]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB105_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB105_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB106_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB106_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB106_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v4
+; GCN3-NEXT: v_mov_b32_e32 v1, v5
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
; ---------------------------------------------------------------------
; atomicrmw umin
; ---------------------------------------------------------------------
@@ -5166,7 +6162,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: flat_load_dword v6, v[0:1]
; GCN1-NEXT: flat_load_dword v7, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
@@ -5180,7 +6176,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v4
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB87_1
+; GCN1-NEXT: s_cbranch_execnz .LBB107_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5193,7 +6189,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: flat_load_dword v6, v[0:1]
; GCN2-NEXT: flat_load_dword v7, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
@@ -5207,7 +6203,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v4
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB87_1
+; GCN2-NEXT: s_cbranch_execnz .LBB107_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5217,7 +6213,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
@@ -5231,7 +6227,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB87_1
+; GCN3-NEXT: s_cbranch_execnz .LBB107_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -5250,7 +6246,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: flat_load_dword v7, v[0:1]
; GCN1-NEXT: flat_load_dword v6, v[8:9]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
@@ -5264,7 +6260,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB88_1
+; GCN1-NEXT: s_cbranch_execnz .LBB108_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5279,7 +6275,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: flat_load_dword v7, v[0:1]
; GCN2-NEXT: flat_load_dword v6, v[8:9]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
@@ -5293,7 +6289,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB88_1
+; GCN2-NEXT: s_cbranch_execnz .LBB108_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5303,7 +6299,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
@@ -5317,7 +6313,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB88_1
+; GCN3-NEXT: s_cbranch_execnz .LBB108_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -5335,7 +6331,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: flat_load_dword v4, v[0:1]
; GCN1-NEXT: flat_load_dword v5, v[5:6]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v7, v5
@@ -5349,7 +6345,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB89_1
+; GCN1-NEXT: s_cbranch_execnz .LBB109_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v0, v4
@@ -5364,7 +6360,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: flat_load_dword v4, v[0:1]
; GCN2-NEXT: flat_load_dword v5, v[5:6]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v7, v5
@@ -5378,7 +6374,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB89_1
+; GCN2-NEXT: s_cbranch_execnz .LBB109_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v0, v4
@@ -5390,7 +6386,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -5404,7 +6400,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB89_1
+; GCN3-NEXT: s_cbranch_execnz .LBB109_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -5425,7 +6421,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -5439,7 +6435,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB90_1
+; GCN1-NEXT: s_cbranch_execnz .LBB110_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5454,7 +6450,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -5468,7 +6464,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB90_1
+; GCN2-NEXT: s_cbranch_execnz .LBB110_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5478,7 +6474,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -5492,7 +6488,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB90_1
+; GCN3-NEXT: s_cbranch_execnz .LBB110_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -5520,7 +6516,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN1-NEXT: v_mov_b32_e32 v6, s7
; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
@@ -5534,7 +6530,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB91_1
+; GCN1-NEXT: s_cbranch_execnz .LBB111_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5555,7 +6551,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN2-NEXT: v_mov_b32_e32 v6, s7
; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
@@ -5569,7 +6565,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB91_1
+; GCN2-NEXT: s_cbranch_execnz .LBB111_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5585,7 +6581,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
@@ -5599,7 +6595,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB91_1
+; GCN3-NEXT: s_cbranch_execnz .LBB111_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -5624,7 +6620,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v6, s7
; GCN1-NEXT: v_mov_b32_e32 v7, s6
-; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
@@ -5638,7 +6634,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB92_1
+; GCN1-NEXT: s_cbranch_execnz .LBB112_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5659,7 +6655,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v6, s7
; GCN2-NEXT: v_mov_b32_e32 v7, s6
-; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
@@ -5673,7 +6669,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB92_1
+; GCN2-NEXT: s_cbranch_execnz .LBB112_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5689,7 +6685,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
@@ -5703,7 +6699,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB92_1
+; GCN3-NEXT: s_cbranch_execnz .LBB112_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -5729,7 +6725,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -5743,7 +6739,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB93_1
+; GCN1-NEXT: s_cbranch_execnz .LBB113_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5764,7 +6760,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -5778,7 +6774,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB93_1
+; GCN2-NEXT: s_cbranch_execnz .LBB113_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5794,7 +6790,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_mov_b32_e32 v4, s7
; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v1
@@ -5808,7 +6804,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB93_1
+; GCN3-NEXT: s_cbranch_execnz .LBB113_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -5833,7 +6829,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: v_mov_b32_e32 v5, s6
-; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -5847,7 +6843,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB94_1
+; GCN1-NEXT: s_cbranch_execnz .LBB114_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5868,7 +6864,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: v_mov_b32_e32 v5, s6
-; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -5882,7 +6878,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB94_1
+; GCN2-NEXT: s_cbranch_execnz .LBB114_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5898,7 +6894,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN3-NEXT: v_mov_b32_e32 v4, s7
; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v1
@@ -5912,7 +6908,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB94_1
+; GCN3-NEXT: s_cbranch_execnz .LBB114_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -5921,34 +6917,210 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
  ret i64 %result
}

-; ---------------------------------------------------------------------
-; atomicrmw min
-; ---------------------------------------------------------------------
-
-define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
-; GCN1-LABEL: flat_atomic_min_i64_noret:
+define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[0:1]
+; GCN1-NEXT: flat_load_dword v6, v[8:9]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB115_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[0:1]
+; GCN2-NEXT: flat_load_dword v6, v[8:9]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB115_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB115_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) {
+; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB116_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN2: ; %bb.0:
+; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB116_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access:
+; GCN3: ; %bb.0:
+; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB116_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v4
+; GCN3-NEXT: v_mov_b32_e32 v1, v5
+; GCN3-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr %out, i64 4
+  %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i64 %result
+}
+
+; ---------------------------------------------------------------------
+; atomicrmw min
+; ---------------------------------------------------------------------
+
+define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
+; GCN1-LABEL: flat_atomic_min_i64_noret:
+; GCN1: ; %bb.0:
+; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN1-NEXT: v_mov_b32_e32 v7, v5
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v4
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB95_1
+; GCN1-NEXT: s_cbranch_execnz .LBB117_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -5961,7 +7133,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: flat_load_dword v6, v[0:1]
; GCN2-NEXT: flat_load_dword v7, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
@@ -5975,7 +7147,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v4
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB95_1
+; GCN2-NEXT: s_cbranch_execnz .LBB117_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -5985,7 +7157,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
@@ -5999,7 +7171,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB95_1
+; GCN3-NEXT: s_cbranch_execnz .LBB117_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -6018,7 +7190,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: flat_load_dword v7, v[0:1]
; GCN1-NEXT: flat_load_dword v6, v[8:9]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
@@ -6032,7 +7204,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v6, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB96_1
+; GCN1-NEXT: s_cbranch_execnz .LBB118_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -6047,7 +7219,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: flat_load_dword v7, v[0:1]
; GCN2-NEXT: flat_load_dword v6, v[8:9]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
@@ -6061,7 +7233,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v6, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB96_1
+; GCN2-NEXT: s_cbranch_execnz .LBB118_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -6071,7 +7243,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
@@ -6085,7 +7257,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v6, v4
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB96_1
+; GCN3-NEXT: s_cbranch_execnz .LBB118_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -6103,7 +7275,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: flat_load_dword v4, v[0:1]
; GCN1-NEXT: flat_load_dword v5, v[5:6]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v7, v5
@@ -6117,7 +7289,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB97_1
+; GCN1-NEXT: s_cbranch_execnz .LBB119_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v0, v4
@@ -6132,7 +7304,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: flat_load_dword v4, v[0:1]
; GCN2-NEXT: flat_load_dword v5, v[5:6]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v7, v5
@@ -6146,7 +7318,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB97_1
+; GCN2-NEXT: s_cbranch_execnz .LBB119_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: v_mov_b32_e32 v0, v4
@@ -6158,7 +7330,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -6172,7 +7344,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB97_1
+; GCN3-NEXT: s_cbranch_execnz .LBB119_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -6193,7 +7365,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: flat_load_dword v1, v[0:1]
; GCN1-NEXT: flat_load_dword v0, v[4:5]
; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -6207,7 +7379,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB98_1
+; GCN1-NEXT: s_cbranch_execnz .LBB120_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -6222,7 +7394,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: flat_load_dword v1, v[0:1]
; GCN2-NEXT: flat_load_dword v0, v[4:5]
; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -6236,7 +7408,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB98_1
+; GCN2-NEXT: s_cbranch_execnz .LBB120_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -6246,7 +7418,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v7, v5
@@ -6260,7 +7432,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB98_1
+; GCN3-NEXT: s_cbranch_execnz .LBB120_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v4
@@ -6288,7 +7460,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: v_mov_b32_e32 v6, s7
; GCN1-NEXT: v_mov_b32_e32 v7, s6
; GCN1-NEXT: v_mov_b32_e32 v5, s5
-; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6302,7 +7474,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB99_1
+; GCN1-NEXT: s_cbranch_execnz .LBB121_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -6323,7 +7495,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: v_mov_b32_e32 v6, s7
; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: v_mov_b32_e32 v5, s5
-; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6337,7 +7509,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB99_1
+; GCN2-NEXT: s_cbranch_execnz .LBB121_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -6353,7 +7525,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6367,7 +7539,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB99_1
+; GCN3-NEXT: s_cbranch_execnz .LBB121_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -6392,7 +7564,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v6, s7
; GCN1-NEXT: v_mov_b32_e32 v7, s6
-; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6406,7 +7578,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB100_1
+; GCN1-NEXT: s_cbranch_execnz .LBB122_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -6427,7 +7599,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v6, s7
; GCN2-NEXT: v_mov_b32_e32 v7, s6
-; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6441,7 +7613,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB100_1
+; GCN2-NEXT: s_cbranch_execnz .LBB122_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -6457,7 +7629,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
-; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6471,7 +7643,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB100_1
+; GCN3-NEXT: s_cbranch_execnz .LBB122_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -6497,7 +7669,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -6511,7 +7683,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB101_1
+; GCN1-NEXT: s_cbranch_execnz .LBB123_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -6532,7 +7704,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: v_mov_b32_e32 v5, s6
; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -6546,7 +7718,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB101_1
+; GCN2-NEXT: s_cbranch_execnz .LBB123_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -6562,7 +7734,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_mov_b32_e32 v4, s7
; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v1
@@ -6576,7 +7748,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB101_1
+; GCN3-NEXT: s_cbranch_execnz .LBB123_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -6601,7 +7773,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: v_mov_b32_e32 v5, s6
-; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v1
@@ -6615,7 +7787,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB102_1
+; GCN1-NEXT: s_cbranch_execnz .LBB124_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
@@ -6636,7 +7808,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s7
; GCN2-NEXT: v_mov_b32_e32 v5, s6
-; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v1
@@ -6650,7 +7822,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB102_1
+; GCN2-NEXT: s_cbranch_execnz .LBB124_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
@@ -6666,7 +7838,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: v_mov_b32_e32 v4, s7
; GCN3-NEXT: v_mov_b32_e32 v5, s6
; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v1
@@ -6680,7 +7852,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB102_1
+; GCN3-NEXT: s_cbranch_execnz .LBB124_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
@@ -6706,7 +7878,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v6, s3
; GCN1-NEXT: v_mov_b32_e32 v7, s2
-; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
@@ -6720,7 +7892,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN1-NEXT: s_cbranch_execnz .LBB103_1
+; GCN1-NEXT: s_cbranch_execnz .LBB125_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
@@ -6740,7 +7912,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v6, s3
; GCN2-NEXT: v_mov_b32_e32 v7, s2
-; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
@@ -6754,7 +7926,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN2-NEXT: s_cbranch_execnz .LBB103_1
+; GCN2-NEXT: s_cbranch_execnz .LBB125_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
@@ -6772,7 +7944,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
-; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
@@ -6786,7 +7958,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN3-NEXT: s_cbranch_execnz .LBB103_1
+; GCN3-NEXT: s_cbranch_execnz .LBB125_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
@@ -6812,7 +7984,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: s_mov_b64 s[0:1], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s5
; GCN1-NEXT: v_mov_b32_e32 v5, s4
-; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v9, v3
@@ -6826,7 +7998,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN1-NEXT: s_cbranch_execnz .LBB104_1
+; GCN1-NEXT: s_cbranch_execnz .LBB126_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN1-NEXT: v_mov_b32_e32 v0, s2
@@ -6849,7 +8021,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: v_mov_b32_e32 v4, s5
; GCN2-NEXT: v_mov_b32_e32 v5, s4
-; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
+; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
@@ -6863,7 +8035,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN2-NEXT: s_cbranch_execnz .LBB104_1
+; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v0, s2
@@ -6884,7 +8056,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
-; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
+; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
@@ -6898,7 +8070,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GCN3-NEXT: s_cbranch_execnz .LBB104_1
+; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
@@ -6926,7 +8098,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN1-NEXT: v_mov_b32_e32 v6, s3
; GCN1-NEXT: v_mov_b32_e32 v7, s2
; GCN1-NEXT: v_mov_b32_e32 v4, s0
-; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start
+; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
@@ -6940,7 +8112,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN1-NEXT: v_mov_b32_e32 v2, v0
; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB105_1
+; GCN1-NEXT: s_cbranch_execnz .LBB127_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN1-NEXT: s_endpgm
;
@@ -6956,7 +8128,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN2-NEXT: v_mov_b32_e32 v6, s3
; GCN2-NEXT: v_mov_b32_e32 v7, s2
; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT:
.LBB105_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -6970,7 +8142,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB105_1 +; GCN2-NEXT: s_cbranch_execnz .LBB127_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; @@ -6986,7 +8158,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: v_mov_b32_e32 v6, s3 ; GCN3-NEXT: v_mov_b32_e32 v7, s2 ; GCN3-NEXT: v_mov_b32_e32 v4, s0 -; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] @@ -7000,7 +8172,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB105_1 +; GCN3-NEXT: s_cbranch_execnz .LBB127_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: @@ -7022,7 +8194,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b64 s[0:1], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: v_mov_b32_e32 v5, s4 -; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v9, v3 @@ -7036,7 +8208,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN1-NEXT: s_cbranch_execnz .LBB106_1 +; GCN1-NEXT: s_cbranch_execnz .LBB128_1 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN1-NEXT: v_mov_b32_e32 v0, s2 @@ -7057,7 +8229,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b64 s[0:1], 0 ; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: v_mov_b32_e32 v5, s4 -; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v9, v3 @@ -7071,7 +8243,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN2-NEXT: s_cbranch_execnz .LBB106_1 +; GCN2-NEXT: s_cbranch_execnz .LBB128_1 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN2-NEXT: v_mov_b32_e32 v0, s2 @@ -7092,7 +8264,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: v_mov_b32_e32 v5, s4 -; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start +; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v9, v3 @@ -7106,7 +8278,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr 
%out2, i64 %i ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GCN3-NEXT: s_cbranch_execnz .LBB106_1 +; GCN3-NEXT: s_cbranch_execnz .LBB128_1 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN3-NEXT: v_mov_b32_e32 v0, s2 @@ -7120,6 +8292,182 @@ entry: ret void } +define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB129_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB129_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB129_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB130_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB130_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB130_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw uinc_wrap ; --------------------------------------------------------------------- @@ -7416,6 +8764,72 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ret i64 %result } +define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -7711,3 +9125,71 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst ret i64 %result } + +define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr %out, i64 %in) { +; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index 5889de7faf3e5e..d10e049444d685 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1933,6 +1933,646 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ret void } +define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_atomic_fadd_ret_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB12_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_ret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_ret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_ret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], 
off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_ret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_atomic_fadd_noret_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB13_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_noret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_noret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_noret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: 
v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB14_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, 
v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB15_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 
0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 
v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 4598d23e088b54..4ec3ac25b2f1ab 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -353,6 +353,79 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } +define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 
s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f32 ; --------------------------------------------------------------------- @@ -703,6 +776,79 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace( ret float %result } +define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, float %in) { +; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; 
VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, float %in) { +; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -1053,6 +1199,79 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } +define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1440,6 +1659,79 @@ define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) { ret i32 %result } +define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + 
%tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1790,25 +2082,98 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_nand_i32_noret: +define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB41_1: ; %atomicrmw.start -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, v4, v2 -; SI-NEXT: v_not_b32_e32 v3, v3 +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_nand_i32_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB51_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: v_not_b32_e32 v3, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v6, v4 ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -1819,7 +2184,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: s_cbranch_execnz .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1830,7 +2195,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: .LBB51_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1842,7 +2207,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, 
i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: s_cbranch_execnz .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1852,7 +2217,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1864,7 +2229,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1882,7 +2247,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1897,7 +2262,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: s_cbranch_execnz .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1910,7 +2275,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: .LBB52_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1922,7 +2287,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: s_cbranch_execnz .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1932,7 +2297,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 @@ -1944,7 +2309,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 ; 
GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1963,7 +2328,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: .LBB53_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -1978,7 +2343,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: s_cbranch_execnz .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -1990,7 +2355,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: .LBB53_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -2002,7 +2367,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: s_cbranch_execnz .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -2013,7 +2378,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -2025,7 +2390,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -2044,7 +2409,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -2059,7 +2424,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: s_cbranch_execnz .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -2073,7 +2438,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; 
VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -2085,7 +2450,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: s_cbranch_execnz .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2095,7 +2460,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -2107,7 +2472,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -2132,7 +2497,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v1 @@ -2147,7 +2512,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: s_cbranch_execnz .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -2165,7 +2530,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2177,7 +2542,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2188,7 +2553,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2200,7 +2565,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2223,7 +2588,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v1 @@ -2238,7 +2603,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: s_cbranch_execnz .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -2258,7 +2623,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, s6, v3 @@ -2270,7 +2635,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2281,7 +2646,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2293,7 +2658,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2317,7 +2682,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: .LBB57_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -2332,7 +2697,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; 
SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: s_cbranch_execnz .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -2352,7 +2717,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -2364,7 +2729,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2375,7 +2740,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2387,7 +2752,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2410,7 +2775,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: .LBB58_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -2425,7 +2790,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: s_cbranch_execnz .LBB58_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -2445,7 +2810,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: .LBB58_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -2457,7 +2822,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: s_cbranch_execnz .LBB58_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2468,7 +2833,7 
@@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -2480,7 +2845,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2489,6 +2854,170 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) ret i32 %result } +define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: v_not_b32_e32 v3, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: v_not_b32_e32 v3, v3 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 
v4, v2 +; GFX9-NEXT: v_not_b32_e32 v3, v3 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v5, v2 +; SI-NEXT: v_not_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 +; VI-NEXT: v_not_b32_e32 v0, v0 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: v_not_b32_e32 v3, v3 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2876,95 +3405,168 @@ define i32 @global_atomic_or_0_i32_ret(ptr addrspace(1) %ptr) { ret i32 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_xor_i32_noret: +define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i32_noret: +; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_or v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_xor_i32_noret: +; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { -; SI-LABEL: global_atomic_xor_i32_noret_offset: +define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i32_noret_offset: 
+; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_xor_i32_noret_offset: +; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 - %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst - ret void + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } -define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { -; SI-LABEL: global_atomic_xor_i32_ret: +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i32_ret: +; VI-LABEL: global_atomic_xor_i32_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst + ret void +} + +define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_noret_offset: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_noret_offset: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_noret_offset: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst + ret void +} + +define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_ret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc @@ -3263,6 +3865,79 @@ define i32 @global_atomic_xor_0_i32_ret(ptr addrspace(1) %ptr) { ret i32 %result } +define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -3277,7 +3952,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3291,7 +3966,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3302,7 +3977,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: .LBB83_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3313,7 +3988,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: s_cbranch_execnz .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3323,7 +3998,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3334,7 +4009,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3352,7 +4027,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3366,7 
+4041,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3379,7 +4054,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3390,7 +4065,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: s_cbranch_execnz .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3400,7 +4075,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2 @@ -3411,7 +4086,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3430,7 +4105,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB69_1: ; %atomicrmw.start +; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -3444,7 +4119,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB69_1 +; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -3456,7 +4131,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB69_1: ; %atomicrmw.start +; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -3467,7 +4142,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB69_1 +; VI-NEXT: 
s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -3478,7 +4153,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -3489,7 +4164,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB69_1 +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -3508,7 +4183,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -3522,7 +4197,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -3536,7 +4211,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -3547,7 +4222,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: s_cbranch_execnz .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3557,7 +4232,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -3568,7 +4243,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: s_cbranch_execnz .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -3593,7 +4268,7 @@ define amdgpu_gfx void 
@global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB71_1: ; %atomicrmw.start
+; SI-NEXT: .LBB87_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_max_i32_e32 v0, s34, v1
@@ -3607,7 +4282,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB71_1
+; SI-NEXT: s_cbranch_execnz .LBB87_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -3625,7 +4300,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB71_1: ; %atomicrmw.start
+; VI-NEXT: .LBB87_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_max_i32_e32 v2, s6, v3
@@ -3636,7 +4311,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB71_1
+; VI-NEXT: s_cbranch_execnz .LBB87_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -3647,7 +4322,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
@@ -3658,7 +4333,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB71_1
+; GFX9-NEXT: s_cbranch_execnz .LBB87_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3681,7 +4356,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB72_1: ; %atomicrmw.start
+; SI-NEXT: .LBB88_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_max_i32_e32 v0, s34, v1
@@ -3695,7 +4370,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB72_1
+; SI-NEXT: s_cbranch_execnz .LBB88_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -3715,7 +4390,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT: v_mov_b32_e32 v1, s35
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB72_1: ; %atomicrmw.start
+; VI-NEXT: .LBB88_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_max_i32_e32 v2, s6, v3
@@ -3726,7 +4401,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB72_1
+; VI-NEXT: s_cbranch_execnz .LBB88_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -3737,7 +4412,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
@@ -3748,7 +4423,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB72_1
+; GFX9-NEXT: s_cbranch_execnz .LBB88_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3772,7 +4447,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB73_1: ; %atomicrmw.start
+; SI-NEXT: .LBB89_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, v0
@@ -3786,7 +4461,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB73_1
+; SI-NEXT: s_cbranch_execnz .LBB89_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v3, 1
@@ -3806,7 +4481,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT: v_mov_b32_e32 v1, s4
 ; VI-NEXT: s_mov_b64 s[34:35], 0
 ; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: .LBB73_1: ; %atomicrmw.start
+; VI-NEXT: .LBB89_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v4, v0
@@ -3817,7 +4492,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB73_1
+; VI-NEXT: s_cbranch_execnz .LBB89_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -3828,7 +4503,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
@@ -3839,7 +4514,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB73_1
+; GFX9-NEXT: s_cbranch_execnz .LBB89_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3862,7 +4537,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB74_1: ; %atomicrmw.start
+; SI-NEXT: .LBB90_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, v0
@@ -3876,7 +4551,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB74_1
+; SI-NEXT: s_cbranch_execnz .LBB90_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v3, 1
@@ -3896,7 +4571,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: v_mov_b32_e32 v2, s35
 ; VI-NEXT: flat_load_dword v0, v[1:2]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB74_1: ; %atomicrmw.start
+; VI-NEXT: .LBB90_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v4, v0
@@ -3907,7 +4582,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB74_1
+; VI-NEXT: s_cbranch_execnz .LBB90_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -3918,7 +4593,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
@@ -3929,7 +4604,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB74_1
+; GFX9-NEXT: s_cbranch_execnz .LBB90_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3954,7 +4629,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s3
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB75_1: ; %atomicrmw.start
+; SI-NEXT: .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_i32_e32 v0, s2, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -3967,7 +4642,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB75_1
+; SI-NEXT: s_cbranch_execnz .LBB91_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_endpgm
 ;
@@ -3988,7 +4663,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v3, s3
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: .LBB75_1: ; %atomicrmw.start
+; VI-NEXT: .LBB91_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_max_i32_e32 v2, s2, v3
 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -3998,7 +4673,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB75_1
+; VI-NEXT: s_cbranch_execnz .LBB91_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_endpgm
 ;
@@ -4016,7 +4691,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1
 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
@@ -4026,7 +4701,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB75_1
+; GFX9-NEXT: s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_endpgm
 entry:
@@ -4053,7 +4728,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s6
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB76_1: ; %atomicrmw.start
+; SI-NEXT: .LBB92_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_i32_e32 v0, s8, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -4066,7 +4741,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB76_1
+; SI-NEXT: s_cbranch_execnz .LBB92_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_mov_b32 s7, 0xf000
@@ -4094,7 +4769,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s5
 ; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: .LBB76_1: ; %atomicrmw.start
+; VI-NEXT: .LBB92_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: v_max_i32_e32 v2, s4, v3
@@ -4104,7 +4779,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB76_1
+; VI-NEXT: s_cbranch_execnz .LBB92_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -4127,7 +4802,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
 ; GFX9-NEXT: v_max_i32_e32 v2, s2, v3
@@ -4137,7 +4812,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB76_1
+; GFX9-NEXT: s_cbranch_execnz .LBB92_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -4167,7 +4842,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s3
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB77_1: ; %atomicrmw.start
+; SI-NEXT: .LBB93_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_i32_e32 v0, s2, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -4180,7 +4855,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB77_1
+; SI-NEXT: s_cbranch_execnz .LBB93_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_endpgm
 ;
@@ -4199,7 +4874,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: .LBB77_1: ; %atomicrmw.start
+; VI-NEXT: .LBB93_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_max_i32_e32 v2, s2, v3
 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -4209,7 +4884,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB77_1
+; VI-NEXT: s_cbranch_execnz .LBB93_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_endpgm
 ;
@@ -4227,7 +4902,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1
 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
@@ -4237,7 +4912,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB77_1
+; GFX9-NEXT: s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_endpgm
 entry:
@@ -4263,7 +4938,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s6
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB78_1: ; %atomicrmw.start
+; SI-NEXT: .LBB94_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_i32_e32 v0, s8, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -4276,7 +4951,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB78_1
+; SI-NEXT: s_cbranch_execnz .LBB94_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_mov_b32 s7, 0xf000
@@ -4302,7 +4977,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; VI-NEXT: v_mov_b32_e32 v1, s7
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: .LBB78_1: ; %atomicrmw.start
+; VI-NEXT: .LBB94_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: v_max_i32_e32 v2, s4, v3
@@ -4312,7 +4987,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB78_1
+; VI-NEXT: s_cbranch_execnz .LBB94_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -4335,7 +5010,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
 ; GFX9-NEXT: v_max_i32_e32 v2, s2, v3
@@ -4345,7 +5020,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB78_1
+; GFX9-NEXT: s_cbranch_execnz .LBB94_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -4358,49 +5033,47 @@ entry:
   ret void
 }
 
-; ---------------------------------------------------------------------
-; atomicrmw umax
-; ---------------------------------------------------------------------
-
-define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
-; SI-LABEL: global_atomic_umax_i32_noret:
+define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB79_1: ; %atomicrmw.start
+; SI-NEXT: .LBB95_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
+; SI-NEXT: v_max_i32_e32 v3, v4, v2
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v6, v4
 ; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v4, v5
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB79_1
+; SI-NEXT: s_cbranch_execnz .LBB95_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umax_i32_noret:
+; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_dword v4, v[0:1]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB79_1: ; %atomicrmw.start
+; VI-NEXT: .LBB95_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_u32_e32 v3, v4, v2
+; VI-NEXT: v_max_i32_e32 v3, v4, v2
 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: buffer_wbinvl1_vol
@@ -4408,73 +5081,233 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v4, v3
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB79_1
+; VI-NEXT: s_cbranch_execnz .LBB95_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: global_atomic_umax_i32_noret:
+; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory_access:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: buffer_wbinvl1_vol
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB79_1
+; GFX9-NEXT: s_cbranch_execnz .LBB95_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
-  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
   ret void
 }
 
-define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
-; SI-LABEL: global_atomic_umax_i32_noret_offset:
+define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_mov_b32 s6, 0
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s4, s6
 ; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB80_1: ; %atomicrmw.start
+; SI-NEXT: .LBB96_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
 ; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_i32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB80_1
+; SI-NEXT: s_cbranch_execnz .LBB96_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: global_atomic_umax_i32_noret_offset:
+; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB96_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_max_i32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB96_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB96_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i32 %result
+}
+
+; ---------------------------------------------------------------------
+; atomicrmw umax
+; ---------------------------------------------------------------------
+
+define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
+; SI-LABEL: global_atomic_umax_i32_noret:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB97_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_max_u32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB97_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_atomic_umax_i32_noret:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB97_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_u32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB97_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_atomic_umax_i32_noret:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB97_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
+  ret void
+}
+
+define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umax_i32_noret_offset:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB98_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_max_u32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB98_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_atomic_umax_i32_noret_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_dword v4, v[0:1]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB80_1: ; %atomicrmw.start
+; VI-NEXT: .LBB98_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
@@ -4485,7 +5318,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v4, v3
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB80_1
+; VI-NEXT: s_cbranch_execnz .LBB98_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -4495,7 +5328,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
@@ -4506,7 +5339,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB80_1
+; GFX9-NEXT: s_cbranch_execnz .LBB98_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4525,7 +5358,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s5, s6
 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB81_1: ; %atomicrmw.start
+; SI-NEXT: .LBB99_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, v3
@@ -4539,7 +5372,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB81_1
+; SI-NEXT: s_cbranch_execnz .LBB99_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v0, v3
@@ -4551,7 +5384,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB81_1: ; %atomicrmw.start
+; VI-NEXT: .LBB99_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v4, v3
@@ -4562,7 +5395,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB81_1
+; VI-NEXT: s_cbranch_execnz .LBB99_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v0, v3
@@ -4573,7 +5406,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
@@ -4584,7 +5417,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB81_1
+; GFX9-NEXT: s_cbranch_execnz .LBB99_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
@@ -4603,7 +5436,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT: s_mov_b32 s5, s6
 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB82_1: ; %atomicrmw.start
+; SI-NEXT: .LBB100_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, v3
@@ -4617,7 +5450,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB82_1
+; SI-NEXT: s_cbranch_execnz .LBB100_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v0, v3
@@ -4631,7 +5464,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_dword v0, v[3:4]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB82_1: ; %atomicrmw.start
+; VI-NEXT: .LBB100_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v1, v0
@@ -4642,7 +5475,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB82_1
+; VI-NEXT: s_cbranch_execnz .LBB100_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -4652,7 +5485,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
@@ -4663,7 +5496,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB82_1
+; GFX9-NEXT: s_cbranch_execnz .LBB100_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
@@ -4688,7 +5521,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB83_1: ; %atomicrmw.start
+; SI-NEXT: .LBB101_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_max_u32_e32 v0, s34, v1
@@ -4702,7 +5535,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB83_1
+; SI-NEXT: s_cbranch_execnz .LBB101_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -4720,7 +5553,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB83_1: ; %atomicrmw.start
+; VI-NEXT: .LBB101_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_max_u32_e32 v2, s6, v3
@@ -4731,7 +5564,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB83_1
+; VI-NEXT: s_cbranch_execnz .LBB101_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -4742,7 +5575,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
@@ -4753,7 +5586,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB83_1
+; GFX9-NEXT: s_cbranch_execnz .LBB101_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4776,7 +5609,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB84_1: ; %atomicrmw.start
+; SI-NEXT: .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_max_u32_e32 v0, s34, v1
@@ -4790,7 +5623,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB84_1
+; SI-NEXT: s_cbranch_execnz .LBB102_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -4810,7 +5643,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT: v_mov_b32_e32 v1, s35
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB84_1: ; %atomicrmw.start
+; VI-NEXT: .LBB102_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_max_u32_e32 v2, s6, v3
@@ -4821,7 +5654,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB84_1
+; VI-NEXT: s_cbranch_execnz .LBB102_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -4832,7 +5665,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
@@ -4843,7 +5676,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB84_1
+; GFX9-NEXT: s_cbranch_execnz .LBB102_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4867,7 +5700,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB85_1: ; %atomicrmw.start
+; SI-NEXT: .LBB103_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, v0
@@ -4881,7 +5714,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB85_1
+; SI-NEXT: s_cbranch_execnz .LBB103_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v3, 1
@@ -4901,7 +5734,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT: v_mov_b32_e32 v1, s4
 ; VI-NEXT: s_mov_b64 s[34:35], 0
 ; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: .LBB85_1: ; %atomicrmw.start
+; VI-NEXT: .LBB103_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v4, v0
@@ -4912,7 +5745,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB85_1
+; VI-NEXT: s_cbranch_execnz .LBB103_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -4923,7 +5756,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
@@ -4934,7 +5767,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB85_1
+; GFX9-NEXT: s_cbranch_execnz .LBB103_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4957,7 +5790,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB86_1: ; %atomicrmw.start
+; SI-NEXT: .LBB104_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, v0
@@ -4971,7 +5804,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB86_1
+; SI-NEXT: s_cbranch_execnz .LBB104_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v3, 1
@@ -4991,7 +5824,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: v_mov_b32_e32 v2, s35
 ; VI-NEXT: flat_load_dword v0, v[1:2]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB86_1: ; %atomicrmw.start
+; VI-NEXT: .LBB104_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v4, v0
@@ -5002,7 +5835,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB86_1
+; VI-NEXT: s_cbranch_execnz .LBB104_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5013,7 +5846,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
@@ -5024,7 +5857,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB86_1
+; GFX9-NEXT: s_cbranch_execnz .LBB104_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5049,7 +5882,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s3
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB87_1: ; %atomicrmw.start
+; SI-NEXT: .LBB105_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_u32_e32 v0, s2, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -5062,7 +5895,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB87_1
+; SI-NEXT: s_cbranch_execnz .LBB105_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_endpgm
 ;
@@ -5083,7 +5916,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v3, s3
 ; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: .LBB87_1: ; %atomicrmw.start
+; VI-NEXT: .LBB105_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_max_u32_e32 v2, s2, v3
 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -5093,7 +5926,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB87_1
+; VI-NEXT: s_cbranch_execnz .LBB105_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_endpgm
 ;
@@ -5111,7 +5944,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v1
 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
@@ -5121,7 +5954,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB87_1
+; GFX9-NEXT: s_cbranch_execnz .LBB105_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_endpgm
 entry:
@@ -5148,7 +5981,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s6
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB88_1: ; %atomicrmw.start
+; SI-NEXT: .LBB106_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_u32_e32 v0, s8, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -5161,7 +5994,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB88_1
+; SI-NEXT: s_cbranch_execnz .LBB106_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_mov_b32 s7, 0xf000
@@ -5189,7 +6022,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s5
 ; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: .LBB88_1: ; %atomicrmw.start
+; VI-NEXT: .LBB106_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: v_max_u32_e32 v2, s4, v3
@@ -5199,7 +6032,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB88_1
+; VI-NEXT: s_cbranch_execnz .LBB106_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -5222,7 +6055,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
 ; GFX9-NEXT: v_max_u32_e32 v2, s2, v3
@@ -5232,7 +6065,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB88_1
+; GFX9-NEXT: s_cbranch_execnz .LBB106_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5263,7 +6096,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v1, s6
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: .LBB89_1: ; %atomicrmw.start
+; SI-NEXT: .LBB107_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: v_max_u32_e32 v0, s8, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -5276,7 +6109,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execnz .LBB89_1
+; SI-NEXT: s_cbranch_execnz .LBB107_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT: s_mov_b32 s7, 0xf000
@@ -5302,7 +6135,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; VI-NEXT: v_mov_b32_e32 v1, s7
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: .LBB89_1: ; %atomicrmw.start
+; VI-NEXT: .LBB107_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: v_max_u32_e32 v2, s4, v3
@@ -5312,7 +6145,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB89_1
+; VI-NEXT: s_cbranch_execnz .LBB107_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
 ; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -5335,7 +6168,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
 ; GFX9-NEXT: v_max_u32_e32 v2, s2, v3
@@ -5345,7 +6178,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB89_1
+; GFX9-NEXT: s_cbranch_execnz .LBB107_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5358,6 +6191,164 @@ entry:
   ret void
 }
 
+define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB108_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_max_u32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB108_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB108_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_max_u32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB108_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory_access:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB108_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret void
+}
+
+define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB109_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_max_u32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB109_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB109_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_max_u32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB109_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory_access:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB109_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+  %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0
+  ret i32 %result
+}
+
 ; ---------------------------------------------------------------------
 ; atomicrmw umin
 ; ---------------------------------------------------------------------
@@ -5372,7 +6363,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s5, s6
 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB90_1: ; %atomicrmw.start
+; SI-NEXT: .LBB110_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
@@ -5386,7 +6377,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v4, v5
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB90_1
+; SI-NEXT: s_cbranch_execnz .LBB110_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -5397,7 +6388,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: flat_load_dword v4, v[0:1]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB90_1: ; %atomicrmw.start
+; VI-NEXT: .LBB110_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
@@ -5408,7 +6399,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v4, v3
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB90_1
+; VI-NEXT: s_cbranch_execnz .LBB110_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5418,7 +6409,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
@@ -5429,7 +6420,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB90_1
+; GFX9-NEXT: s_cbranch_execnz .LBB110_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5447,7 +6438,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT: s_mov_b32 s5, s6
 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB91_1: ; %atomicrmw.start
+; SI-NEXT: .LBB111_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
@@ -5461,7 +6452,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v4, v5
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB91_1
+; SI-NEXT: s_cbranch_execnz .LBB111_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -5474,7 +6465,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_dword v4, v[0:1]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB91_1: ; %atomicrmw.start
+; VI-NEXT: .LBB111_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
@@ -5485,7 +6476,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v4, v3
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB91_1
+; VI-NEXT: s_cbranch_execnz .LBB111_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5495,7 +6486,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
@@ -5506,7 +6497,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB91_1
+; GFX9-NEXT: s_cbranch_execnz .LBB111_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5525,7 +6516,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: s_mov_b32 s5, s6
 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB92_1: ; %atomicrmw.start
+; SI-NEXT: .LBB112_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, v3
@@ -5539,7 +6530,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB92_1
+; SI-NEXT: s_cbranch_execnz .LBB112_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v0, v3
@@ -5551,7 +6542,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB92_1: ; %atomicrmw.start
+; VI-NEXT: .LBB112_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v4, v3
@@ -5562,7 +6553,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB92_1
+; VI-NEXT: s_cbranch_execnz .LBB112_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: v_mov_b32_e32 v0, v3
@@ -5573,7 +6564,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
@@ -5584,7 +6575,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB92_1
+; GFX9-NEXT: s_cbranch_execnz .LBB112_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
@@ -5603,7 +6594,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT: s_mov_b32 s5, s6
 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
 ; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB93_1: ; %atomicrmw.start
+; SI-NEXT: .LBB113_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, v3
@@ -5617,7 +6608,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB93_1
+; SI-NEXT: s_cbranch_execnz .LBB113_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT: v_mov_b32_e32 v0, v3
@@ -5631,7 +6622,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_dword v0, v[3:4]
 ; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB93_1: ; %atomicrmw.start
+; VI-NEXT: .LBB113_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_mov_b32_e32 v1, v0
@@ -5642,7 +6633,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB93_1
+; VI-NEXT: s_cbranch_execnz .LBB113_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5652,7 +6643,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
@@ -5663,7 +6654,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB93_1
+; GFX9-NEXT: s_cbranch_execnz .LBB113_1
 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
@@ -5688,7 +6679,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB94_1: ; %atomicrmw.start
+; SI-NEXT: .LBB114_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_min_u32_e32 v0, s34, v1
@@ -5702,7 +6693,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB94_1
+; SI-NEXT: s_cbranch_execnz .LBB114_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
 ; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -5720,7 +6711,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT: v_mov_b32_e32 v1, s5
 ; VI-NEXT: flat_load_dword v3, v[0:1]
 ; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB94_1: ; %atomicrmw.start
+; VI-NEXT: .LBB114_1: ; %atomicrmw.start
 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_min_u32_e32 v2, s6, v3
@@ -5731,7 +6722,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT: v_mov_b32_e32 v3, v2
 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB94_1
+; VI-NEXT: s_cbranch_execnz .LBB114_1
 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5742,7 +6733,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start
 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1
@@ -5753,7 +6744,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB94_1
+; GFX9-NEXT:
s_cbranch_execnz .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5776,7 +6767,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB95_1: ; %atomicrmw.start +; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_u32_e32 v0, s34, v1 @@ -5790,7 +6781,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cbranch_execnz .LBB115_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -5810,7 +6801,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB95_1: ; %atomicrmw.start +; VI-NEXT: .LBB115_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_u32_e32 v2, s6, v3 @@ -5821,7 +6812,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cbranch_execnz .LBB115_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5832,7 +6823,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1 @@ -5843,7 +6834,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5867,7 +6858,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB96_1: ; %atomicrmw.start +; SI-NEXT: .LBB116_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -5881,7 +6872,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_cbranch_execnz .LBB116_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -5901,7 +6892,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr 
addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB96_1: ; %atomicrmw.start +; VI-NEXT: .LBB116_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -5912,7 +6903,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_cbranch_execnz .LBB116_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5923,7 +6914,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -5934,7 +6925,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5957,7 +6948,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB97_1: ; %atomicrmw.start +; SI-NEXT: .LBB117_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -5971,7 +6962,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_cbranch_execnz .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -5991,45 +6982,203 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB97_1: ; %atomicrmw.start +; VI-NEXT: .LBB117_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_min_u32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB117_1: ; 
%atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_min_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 + %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst + ret i32 %result +} + +define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB118_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_min_u32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB118_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_min_u32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB119_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_min_u32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB119_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_min_u32_e32 v3, s6, v4 -; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_min_u32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 -; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar: +; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_u32_e32 v2, s6, v3 -; GFX9-NEXT: global_atomic_cmpswap 
v0, v1, v[2:3], s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_min_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 - %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret i32 %result } @@ -6047,7 +7196,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB98_1: ; %atomicrmw.start +; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6061,7 +7210,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cbranch_execnz .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6072,7 +7221,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB98_1: ; %atomicrmw.start +; VI-NEXT: .LBB120_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6083,7 +7232,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cbranch_execnz .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6093,7 +7242,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6104,7 +7253,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] 
@@ -6122,7 +7271,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB99_1: ; %atomicrmw.start +; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6136,7 +7285,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6149,7 +7298,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB99_1: ; %atomicrmw.start +; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6160,7 +7309,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v4, v3 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_cbranch_execnz .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6170,7 +7319,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 @@ -6181,7 +7330,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6200,7 +7349,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB100_1: ; %atomicrmw.start +; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -6214,7 +7363,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -6226,7 +7375,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB100_1: ; 
%atomicrmw.start +; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 @@ -6237,7 +7386,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v3 @@ -6248,7 +7397,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -6259,7 +7408,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -6278,7 +7427,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB101_1: ; %atomicrmw.start +; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 @@ -6292,7 +7441,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: v_mov_b32_e32 v0, v3 @@ -6306,7 +7455,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[3:4] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB101_1: ; %atomicrmw.start +; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 @@ -6317,7 +7466,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_cbranch_execnz .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6327,7 +7476,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 @@ -6338,7 +7487,7 @@ define i32 
@global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 @@ -6363,7 +7512,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB102_1: ; %atomicrmw.start +; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v0, s34, v1 @@ -6377,7 +7526,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -6395,7 +7544,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB102_1: ; %atomicrmw.start +; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v2, s6, v3 @@ -6406,7 +7555,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6417,7 +7566,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 @@ -6428,7 +7577,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6451,7 +7600,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB103_1: ; %atomicrmw.start +; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_min_i32_e32 v0, s34, v1 @@ -6465,7 +7614,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz 
.LBB103_1 +; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v4, 1 @@ -6485,7 +7634,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: v_mov_b32_e32 v1, s35 ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB103_1: ; %atomicrmw.start +; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_min_i32_e32 v2, s6, v3 @@ -6496,7 +7645,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6507,7 +7656,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 @@ -6518,7 +7667,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6542,7 +7691,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB104_1: ; %atomicrmw.start +; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -6556,7 +7705,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_cbranch_execnz .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -6576,7 +7725,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB104_1: ; %atomicrmw.start +; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -6587,7 +7736,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6598,7 +7747,7 @@ define amdgpu_gfx i32 
@global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -6609,7 +7758,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6632,7 +7781,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB105_1: ; %atomicrmw.start +; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, v0 @@ -6646,7 +7795,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v3, 1 @@ -6666,7 +7815,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s35 ; VI-NEXT: flat_load_dword v0, v[1:2] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB105_1: ; %atomicrmw.start +; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v0 @@ -6677,7 +7826,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6688,7 +7837,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, v0 @@ -6699,7 +7848,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6724,7 +7873,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB106_1: ; 
%atomicrmw.start +; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6737,7 +7886,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_cbranch_execnz .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -6758,7 +7907,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: .LBB106_1: ; %atomicrmw.start +; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_min_i32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6768,7 +7917,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -6786,7 +7935,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6796,7 +7945,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -6823,7 +7972,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB107_1: ; %atomicrmw.start +; SI-NEXT: .LBB129_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s8, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6836,7 +7985,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: s_cbranch_execnz .LBB129_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -6864,7 +8013,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB107_1: ; %atomicrmw.start +; VI-NEXT: .LBB129_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: v_min_i32_e32 v2, s4, v3 @@ -6874,7 +8023,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; 
VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: s_cbranch_execnz .LBB129_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -6897,7 +8046,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 @@ -6907,7 +8056,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -6933,7 +8082,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: .LBB130_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_min_i32_e32 v0, s2, v1 ; SI-NEXT: s_waitcnt expcnt(0) @@ -6946,7 +8095,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v2 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: s_cbranch_execnz .LBB130_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -6961,7 +8110,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: .LBB130_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_min_i32_e32 v2, s2, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6971,7 +8120,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: s_cbranch_execnz .LBB130_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -6985,7 +8134,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc @@ -6995,7 +8144,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7017,102 +8166,260 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: s_load_dword s6, s[4:5], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; 
SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .LBB131_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_min_i32_e32 v0, s8, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execnz .LBB131_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: atomic_min_i32_ret_addr64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_ashr_i32 s7, s5, 31 +; VI-NEXT: s_mov_b32 s6, s5 +; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 +; VI-NEXT: s_add_u32 s6, s0, s6 +; VI-NEXT: s_addc_u32 s7, s1, s7 +; VI-NEXT: s_load_dword s5, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB131_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_min_i32_e32 v2, s4, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB131_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i32_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_mov_b32 s0, s3 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index + 
%tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst + store i32 %tmp0, ptr addrspace(1) %out2 + ret void +} + +define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB132_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_min_i32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB132_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB132_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_min_i32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB132_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB133_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_min_i32_e32 v0, s8, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SI-NEXT: v_min_i32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 -; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, v2 -; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB133_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: atomic_min_i32_ret_addr64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s7, s5, 31 -; VI-NEXT: s_mov_b32 s6, s5 -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2 -; VI-NEXT: s_add_u32 s6, s0, s6 -; VI-NEXT: s_addc_u32 s7, s1, s7 -; VI-NEXT: s_load_dword s5, s[6:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB133_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: v_min_i32_e32 v2, s4, v3 -; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_min_i32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB133_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
atomic_min_i32_ret_addr64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_mov_b32 s0, s3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX9-NEXT: s_add_u32 s0, s4, s0 -; GFX9-NEXT: s_addc_u32 s1, s5, s1 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_min_i32_e32 v2, s2, v3 -; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_min_i32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: s_cbranch_execnz .LBB133_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-NEXT: s_endpgm -entry: - %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index - %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst - store i32 %tmp0, ptr addrspace(1) %out2 - ret void +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result } ; --------------------------------------------------------------------- @@ -7465,6 +8772,79 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ret i32 %result } +define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -7814,3 +9194,78 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst ret i32 %result } + +define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in 
seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index 380ce7f3b93988..f5c2bd6286cb8e 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -367,6 +367,80 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access(ptr 
addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw xchg f64 ; --------------------------------------------------------------------- @@ -731,6 +805,80 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace ret double %result } +define void @global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, double %in) { +; SI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define double @global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, double %in) { +; SI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: 
s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %result +} + ; --------------------------------------------------------------------- ; atomicrmw add ; --------------------------------------------------------------------- @@ -1095,6 +1243,80 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, 
v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_add_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------- @@ -1459,6 +1681,80 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw and ; --------------------------------------------------------------------- @@ -1823,21 +2119,95 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw nand -; --------------------------------------------------------------------- - -define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_nand_i64_noret: +define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB40_1: ; %atomicrmw.start +; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 
s[30:31] +; +; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_nand_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1857,7 +2227,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB40_1 +; SI-NEXT: s_cbranch_execnz .LBB50_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1868,7 +2238,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB40_1: ; %atomicrmw.start +; VI-NEXT: .LBB50_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1883,7 +2253,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB40_1 +; VI-NEXT: s_cbranch_execnz .LBB50_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1893,7 +2263,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1908,7 +2278,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB40_1 +; 
GFX9-NEXT: s_cbranch_execnz .LBB50_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1926,7 +2296,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI-NEXT: .LBB51_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1946,7 +2316,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: s_cbranch_execnz .LBB51_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -1959,7 +2329,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: .LBB51_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1974,7 +2344,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: s_cbranch_execnz .LBB51_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1984,7 +2354,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 @@ -1999,7 +2369,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2022,7 +2392,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: .LBB52_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -2042,7 +2412,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: s_cbranch_execnz .LBB52_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -2053,7 +2423,7 @@ define i64 @global_atomic_nand_i64_ret(ptr 
addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: .LBB52_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -2068,7 +2438,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: s_cbranch_execnz .LBB52_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -2080,7 +2450,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2095,7 +2465,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -2119,7 +2489,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: .LBB53_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -2139,7 +2509,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: s_cbranch_execnz .LBB53_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -2152,7 +2522,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: .LBB53_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -2167,7 +2537,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: s_cbranch_execnz .LBB53_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2177,7 +2547,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: 
.LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -2192,7 +2562,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -2219,7 +2589,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: .LBB54_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v3 @@ -2239,7 +2609,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: s_cbranch_execnz .LBB54_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 @@ -2259,7 +2629,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: .LBB54_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2274,7 +2644,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: s_cbranch_execnz .LBB54_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2285,7 +2655,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2300,7 +2670,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2324,7 +2694,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: .LBB55_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s34, v3 @@ -2344,7 +2714,7 @@ define amdgpu_gfx 
void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: s_cbranch_execnz .LBB55_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v8, 1 @@ -2364,7 +2734,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: .LBB55_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2379,7 +2749,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: s_cbranch_execnz .LBB55_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2390,7 +2760,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 @@ -2405,7 +2775,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2430,7 +2800,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: .LBB56_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -2450,7 +2820,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: s_cbranch_execnz .LBB56_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 @@ -2470,7 +2840,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: .LBB56_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -2485,7 +2855,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: 
s_cbranch_execnz .LBB56_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2496,7 +2866,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -2511,7 +2881,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2535,7 +2905,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: .LBB57_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v1 @@ -2555,7 +2925,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: s_cbranch_execnz .LBB57_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v6, 1 @@ -2575,7 +2945,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_mov_b64 s[34:35], 0 -; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI-NEXT: .LBB57_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v1 @@ -2590,7 +2960,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: s_cbranch_execnz .LBB57_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2601,7 +2971,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v1 @@ -2616,7 +2986,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ 
-2625,6 +2995,196 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB58_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, v6, v2 +; SI-NEXT: v_not_b32_e32 v5, v4 +; SI-NEXT: v_not_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB58_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v4, v7, v3 +; VI-NEXT: v_and_b32_e32 v8, v6, v2 +; VI-NEXT: v_not_b32_e32 v5, v4 +; VI-NEXT: v_not_b32_e32 v4, v8 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; 
%bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v0, v11, v6 +; SI-NEXT: v_and_b32_e32 v1, v10, v7 +; SI-NEXT: v_not_b32_e32 v9, v0 +; SI-NEXT: v_not_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_and_b32_e32 v0, v9, v3 +; VI-NEXT: v_and_b32_e32 v1, v8, v2 +; VI-NEXT: v_not_b32_e32 v7, v0 +; VI-NEXT: v_not_b32_e32 v6, v1 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], 
v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw or ; --------------------------------------------------------------------- @@ -2989,36 +3549,110 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw xor -; --------------------------------------------------------------------- - -define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_xor_i64_noret: +define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_xor_i64_noret: +; VI-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_xor_i64_noret: +; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_xor_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i64_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3353,6 +3987,80 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } +define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = 
atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw max ; --------------------------------------------------------------------- @@ -3367,7 +4075,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3386,7 +4094,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB64_1 +; SI-NEXT: s_cbranch_execnz .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3397,7 +4105,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI-NEXT: .LBB80_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3411,7 +4119,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB64_1 +; VI-NEXT: s_cbranch_execnz .LBB80_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3421,7 +4129,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { 
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3435,7 +4143,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: s_cbranch_execnz .LBB80_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3453,7 +4161,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3472,7 +4180,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB65_1 +; SI-NEXT: s_cbranch_execnz .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3485,7 +4193,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI-NEXT: .LBB81_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3499,7 +4207,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB65_1 +; VI-NEXT: s_cbranch_execnz .LBB81_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3509,7 +4217,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] @@ -3523,7 +4231,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: s_cbranch_execnz .LBB81_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3546,7 +4254,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB66_1: ; 
%atomicrmw.start +; SI-NEXT: .LBB82_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -3565,7 +4273,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB66_1 +; SI-NEXT: s_cbranch_execnz .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3576,7 +4284,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI-NEXT: .LBB82_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -3590,7 +4298,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB66_1 +; VI-NEXT: s_cbranch_execnz .LBB82_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -3602,7 +4310,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -3616,7 +4324,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: s_cbranch_execnz .LBB82_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -3640,7 +4348,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -3659,7 +4367,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -3672,7 +4380,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: .LBB83_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -3686,7 +4394,7 @@ define i64 
@global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: s_cbranch_execnz .LBB83_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3696,7 +4404,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -3710,7 +4418,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -3739,7 +4447,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: .LBB84_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] @@ -3758,7 +4466,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: s_cbranch_execnz .LBB84_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -3780,7 +4488,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: .LBB84_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3794,7 +4502,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: s_cbranch_execnz .LBB84_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3807,7 +4515,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3821,7 +4529,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; 
GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3847,7 +4555,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB69_1: ; %atomicrmw.start +; SI-NEXT: .LBB85_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] @@ -3866,7 +4574,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB69_1 +; SI-NEXT: s_cbranch_execnz .LBB85_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -3888,7 +4596,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB69_1: ; %atomicrmw.start +; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3902,7 +4610,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB69_1 +; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3915,7 +4623,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] @@ -3929,7 +4637,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB69_1 +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3956,7 +4664,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: .LBB86_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -3975,7 +4683,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: s_cbranch_execnz .LBB86_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -3997,7 +4705,7 @@ define amdgpu_gfx i64 
@global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: .LBB86_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4011,7 +4719,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: s_cbranch_execnz .LBB86_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4024,7 +4732,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -4038,7 +4746,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: s_cbranch_execnz .LBB86_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4064,7 +4772,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI-NEXT: .LBB87_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -4083,7 +4791,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB71_1 +; SI-NEXT: s_cbranch_execnz .LBB87_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -4105,7 +4813,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI-NEXT: .LBB87_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4119,7 +4827,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB71_1 +; VI-NEXT: s_cbranch_execnz .LBB87_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4132,7 +4840,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start ; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -4146,7 +4854,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: s_cbranch_execnz .LBB87_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4173,7 +4881,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI-NEXT: .LBB88_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4191,7 +4899,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB72_1 +; SI-NEXT: s_cbranch_execnz .LBB88_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4214,7 +4922,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI-NEXT: .LBB88_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -4227,7 +4935,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB72_1 +; VI-NEXT: s_cbranch_execnz .LBB88_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4247,7 +4955,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4260,7 +4968,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: s_cbranch_execnz .LBB88_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4287,7 +4995,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI-NEXT: .LBB89_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -4305,7 +5013,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz 
.LBB73_1 +; SI-NEXT: s_cbranch_execnz .LBB89_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -4333,7 +5041,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI-NEXT: .LBB89_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -4346,7 +5054,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB73_1 +; VI-NEXT: s_cbranch_execnz .LBB89_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4369,7 +5077,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -4382,7 +5090,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: s_cbranch_execnz .LBB89_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -4414,7 +5122,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI-NEXT: .LBB90_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4432,7 +5140,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB74_1 +; SI-NEXT: s_cbranch_execnz .LBB90_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -4453,7 +5161,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI-NEXT: .LBB90_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -4466,7 +5174,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB74_1 +; VI-NEXT: s_cbranch_execnz .LBB90_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -4486,7 +5194,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; 
GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -4499,7 +5207,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: s_cbranch_execnz .LBB90_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -4525,7 +5233,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -4543,7 +5251,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB75_1 +; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -4569,7 +5277,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -4582,7 +5290,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB75_1 +; VI-NEXT: s_cbranch_execnz .LBB91_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -4605,7 +5313,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -4618,7 +5326,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -4631,24 +5339,20 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw umax -; --------------------------------------------------------------------- - -define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_umax_i64_noret: +define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: 
global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SI-NEXT: s_waitcnt expcnt(0) @@ -4656,7 +5360,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] @@ -4664,21 +5368,23 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB76_1 +; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: global_atomic_umax_i64_noret: +; VI-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI-NEXT: .LBB92_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc @@ -4689,23 +5395,23 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB76_1 +; VI-NEXT: s_cbranch_execnz .LBB92_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_atomic_umax_i64_noret: +; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory_access: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc 
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] @@ -4713,32 +5419,218 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: s_cbranch_execnz .LBB92_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 ret void } -define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { -; SI-LABEL: global_atomic_umax_i64_noret_offset: +define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] -; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc -; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB93_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; 
VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB94_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i64_noret: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB94_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_gt_u64_e32 vcc, 
v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst + ret void +} + +define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_noret_offset: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB95_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 ; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 @@ -4750,7 +5642,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB77_1 +; SI-NEXT: s_cbranch_execnz .LBB95_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -4763,7 +5655,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI-NEXT: .LBB95_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4777,7 +5669,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: 
s_cbranch_execnz .LBB77_1 +; VI-NEXT: s_cbranch_execnz .LBB95_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4787,7 +5679,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] @@ -4801,7 +5693,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: s_cbranch_execnz .LBB95_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4824,7 +5716,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB78_1: ; %atomicrmw.start +; SI-NEXT: .LBB96_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -4843,7 +5735,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB78_1 +; SI-NEXT: s_cbranch_execnz .LBB96_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -4854,7 +5746,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB78_1: ; %atomicrmw.start +; VI-NEXT: .LBB96_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -4868,7 +5760,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB78_1 +; VI-NEXT: s_cbranch_execnz .LBB96_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -4880,7 +5772,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4894,7 +5786,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB78_1 +; GFX9-NEXT: s_cbranch_execnz .LBB96_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 
v0, v4 @@ -4918,7 +5810,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB79_1: ; %atomicrmw.start +; SI-NEXT: .LBB97_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -4937,7 +5829,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB79_1 +; SI-NEXT: s_cbranch_execnz .LBB97_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -4950,7 +5842,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB79_1: ; %atomicrmw.start +; VI-NEXT: .LBB97_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -4964,7 +5856,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB79_1 +; VI-NEXT: s_cbranch_execnz .LBB97_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4974,7 +5866,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -4988,7 +5880,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB79_1 +; GFX9-NEXT: s_cbranch_execnz .LBB97_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -5017,7 +5909,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB80_1: ; %atomicrmw.start +; SI-NEXT: .LBB98_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] @@ -5036,7 +5928,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB80_1 +; SI-NEXT: s_cbranch_execnz .LBB98_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5058,7 +5950,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; 
VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB80_1: ; %atomicrmw.start +; VI-NEXT: .LBB98_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5072,7 +5964,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB80_1 +; VI-NEXT: s_cbranch_execnz .LBB98_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5085,7 +5977,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5099,7 +5991,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: s_cbranch_execnz .LBB98_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5125,7 +6017,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB81_1: ; %atomicrmw.start +; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] @@ -5144,7 +6036,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB81_1 +; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5166,7 +6058,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB81_1: ; %atomicrmw.start +; VI-NEXT: .LBB99_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ -5180,7 +6072,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB81_1 +; VI-NEXT: s_cbranch_execnz .LBB99_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5193,7 +6085,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] @@ 
-5207,7 +6099,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: s_cbranch_execnz .LBB99_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5234,7 +6126,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB82_1: ; %atomicrmw.start +; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -5253,7 +6145,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB82_1 +; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5275,7 +6167,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB82_1: ; %atomicrmw.start +; VI-NEXT: .LBB100_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -5289,7 +6181,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB82_1 +; VI-NEXT: s_cbranch_execnz .LBB100_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5302,7 +6194,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -5316,7 +6208,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: s_cbranch_execnz .LBB100_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5342,7 +6234,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB83_1: ; %atomicrmw.start +; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -5361,7 +6253,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: 
s_cbranch_execnz .LBB83_1 +; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -5383,7 +6275,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB83_1: ; %atomicrmw.start +; VI-NEXT: .LBB101_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -5397,7 +6289,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB83_1 +; VI-NEXT: s_cbranch_execnz .LBB101_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5410,7 +6302,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -5424,7 +6316,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: s_cbranch_execnz .LBB101_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5451,7 +6343,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB84_1: ; %atomicrmw.start +; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -5469,7 +6361,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB84_1 +; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -5492,7 +6384,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: .LBB84_1: ; %atomicrmw.start +; VI-NEXT: .LBB102_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -5505,7 +6397,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB84_1 +; VI-NEXT: s_cbranch_execnz .LBB102_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -5525,7 +6417,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -5538,7 +6430,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: s_cbranch_execnz .LBB102_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -5565,7 +6457,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB85_1: ; %atomicrmw.start +; SI-NEXT: .LBB103_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -5583,7 +6475,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB85_1 +; SI-NEXT: s_cbranch_execnz .LBB103_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5611,7 +6503,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: .LBB85_1: ; %atomicrmw.start +; VI-NEXT: .LBB103_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -5624,7 +6516,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB85_1 +; VI-NEXT: s_cbranch_execnz .LBB103_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5647,7 +6539,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -5660,7 +6552,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: s_cbranch_execnz .LBB103_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -5691,7 +6583,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB86_1: ; %atomicrmw.start +; SI-NEXT: .LBB104_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -5709,7 +6601,7 @@ define 
amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB86_1 +; SI-NEXT: s_cbranch_execnz .LBB104_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -5735,7 +6627,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB86_1: ; %atomicrmw.start +; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -5748,7 +6640,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB86_1 +; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -5771,7 +6663,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -5784,7 +6676,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: s_cbranch_execnz .LBB104_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -5797,6 +6689,190 @@ entry: ret void } +define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB105_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB105_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB106_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 
v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB106_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw umin ; --------------------------------------------------------------------- @@ -5811,7 +6887,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB87_1: ; %atomicrmw.start +; SI-NEXT: .LBB107_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5830,7 +6906,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB87_1 +; SI-NEXT: s_cbranch_execnz .LBB107_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: 
s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -5841,7 +6917,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB87_1: ; %atomicrmw.start +; VI-NEXT: .LBB107_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5855,7 +6931,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB87_1 +; VI-NEXT: s_cbranch_execnz .LBB107_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5865,7 +6941,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5879,7 +6955,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5897,7 +6973,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB88_1: ; %atomicrmw.start +; SI-NEXT: .LBB108_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5916,7 +6992,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB88_1 +; SI-NEXT: s_cbranch_execnz .LBB108_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -5929,7 +7005,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB88_1: ; %atomicrmw.start +; VI-NEXT: .LBB108_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5943,7 +7019,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB88_1 +; VI-NEXT: s_cbranch_execnz .LBB108_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -5953,7 +7029,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] @@ -5967,7 +7043,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5990,7 +7066,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB89_1: ; %atomicrmw.start +; SI-NEXT: .LBB109_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6009,7 +7085,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB89_1 +; SI-NEXT: s_cbranch_execnz .LBB109_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6020,7 +7096,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB89_1: ; %atomicrmw.start +; VI-NEXT: .LBB109_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -6034,7 +7110,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB89_1 +; VI-NEXT: s_cbranch_execnz .LBB109_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -6046,7 +7122,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6060,7 +7136,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6084,7 +7160,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB90_1: ; %atomicrmw.start +; 
SI-NEXT: .LBB110_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6103,7 +7179,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB90_1 +; SI-NEXT: s_cbranch_execnz .LBB110_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6116,7 +7192,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB90_1: ; %atomicrmw.start +; VI-NEXT: .LBB110_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6130,7 +7206,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB90_1 +; VI-NEXT: s_cbranch_execnz .LBB110_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6140,7 +7216,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6154,7 +7230,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6183,7 +7259,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB91_1: ; %atomicrmw.start +; SI-NEXT: .LBB111_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] @@ -6202,7 +7278,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB91_1 +; SI-NEXT: s_cbranch_execnz .LBB111_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6224,7 +7300,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB91_1: ; %atomicrmw.start +; VI-NEXT: .LBB111_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6238,7 +7314,7 @@ define amdgpu_gfx void 
@global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB91_1 +; VI-NEXT: s_cbranch_execnz .LBB111_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6251,7 +7327,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6265,7 +7341,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6291,7 +7367,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB92_1: ; %atomicrmw.start +; SI-NEXT: .LBB112_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] @@ -6310,7 +7386,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB92_1 +; SI-NEXT: s_cbranch_execnz .LBB112_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6332,7 +7408,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB92_1: ; %atomicrmw.start +; VI-NEXT: .LBB112_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6346,7 +7422,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB92_1 +; VI-NEXT: s_cbranch_execnz .LBB112_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6359,7 +7435,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] @@ -6373,7 +7449,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: 
s_cbranch_execnz .LBB112_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6400,7 +7476,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB93_1: ; %atomicrmw.start +; SI-NEXT: .LBB113_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -6419,7 +7495,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB93_1 +; SI-NEXT: s_cbranch_execnz .LBB113_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6441,7 +7517,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB93_1: ; %atomicrmw.start +; VI-NEXT: .LBB113_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6455,7 +7531,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB93_1 +; VI-NEXT: s_cbranch_execnz .LBB113_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6468,7 +7544,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -6482,7 +7558,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6508,7 +7584,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB94_1: ; %atomicrmw.start +; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -6527,7 +7603,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB94_1 +; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -6549,7 +7625,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr 
addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB94_1: ; %atomicrmw.start +; VI-NEXT: .LBB114_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6563,7 +7639,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB94_1 +; VI-NEXT: s_cbranch_execnz .LBB114_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6576,7 +7652,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -6590,7 +7666,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6599,24 +7675,20 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ret i64 %result } -; --------------------------------------------------------------------- -; atomicrmw min -; --------------------------------------------------------------------- - -define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_min_i64_noret: +define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB95_1: ; %atomicrmw.start +; SI-NEXT: .LBB115_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SI-NEXT: s_waitcnt expcnt(0) @@ -6624,7 +7696,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v10, v6 ; SI-NEXT: v_mov_b32_e32 v9, v5 ; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] @@ -6632,7 +7704,195 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: 
s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB95_1 +; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB115_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB116_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: 
v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB116_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw min +; --------------------------------------------------------------------- + +define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_min_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB117_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB117_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6643,7 +7903,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB95_1: ; %atomicrmw.start +; VI-NEXT: .LBB117_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6657,7 +7917,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB95_1 +; VI-NEXT: s_cbranch_execnz .LBB117_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6667,7 +7927,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6681,7 +7941,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6699,7 +7959,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB96_1: ; %atomicrmw.start +; SI-NEXT: .LBB118_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6718,7 +7978,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: v_mov_b32_e32 v7, v9 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB96_1 +; SI-NEXT: s_cbranch_execnz .LBB118_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6731,7 +7991,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_mov_b64 
s[4:5], 0 -; VI-NEXT: .LBB96_1: ; %atomicrmw.start +; VI-NEXT: .LBB118_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6745,7 +8005,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v6, v4 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB96_1 +; VI-NEXT: s_cbranch_execnz .LBB118_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6755,7 +8015,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] @@ -6769,7 +8029,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6792,7 +8052,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB97_1: ; %atomicrmw.start +; SI-NEXT: .LBB119_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6811,7 +8071,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB97_1 +; SI-NEXT: s_cbranch_execnz .LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6822,7 +8082,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB97_1: ; %atomicrmw.start +; VI-NEXT: .LBB119_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, v5 @@ -6836,7 +8096,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB97_1 +; VI-NEXT: s_cbranch_execnz .LBB119_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, v4 @@ -6848,7 +8108,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6862,7 +8122,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6886,7 +8146,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 -; SI-NEXT: .LBB98_1: ; %atomicrmw.start +; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, v1 @@ -6905,7 +8165,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB98_1 +; SI-NEXT: s_cbranch_execnz .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) @@ -6918,7 +8178,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_mov_b64 s[4:5], 0 -; VI-NEXT: .LBB98_1: ; %atomicrmw.start +; VI-NEXT: .LBB120_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -6932,7 +8192,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB98_1 +; VI-NEXT: s_cbranch_execnz .LBB120_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -6942,7 +8202,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -6956,7 +8216,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, v4 @@ -6985,7 +8245,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB99_1: ; %atomicrmw.start +; SI-NEXT: .LBB121_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] @@ -7004,7 +8264,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI-NEXT: v_mov_b32_e32 
v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB99_1 +; SI-NEXT: s_cbranch_execnz .LBB121_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7026,7 +8286,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: .LBB99_1: ; %atomicrmw.start +; VI-NEXT: .LBB121_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7040,7 +8300,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB99_1 +; VI-NEXT: s_cbranch_execnz .LBB121_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7053,7 +8313,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7067,7 +8327,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7093,7 +8353,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB100_1: ; %atomicrmw.start +; SI-NEXT: .LBB122_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] @@ -7112,7 +8372,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB100_1 +; SI-NEXT: s_cbranch_execnz .LBB122_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7134,7 +8394,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v6, s7 ; VI-NEXT: v_mov_b32_e32 v7, s6 -; VI-NEXT: .LBB100_1: ; %atomicrmw.start +; VI-NEXT: .LBB122_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7148,7 +8408,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB100_1 +; VI-NEXT: s_cbranch_execnz .LBB122_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; 
VI-NEXT: s_setpc_b64 s[30:31] @@ -7161,7 +8421,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] @@ -7175,7 +8435,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7202,7 +8462,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB101_1: ; %atomicrmw.start +; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -7221,7 +8481,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB101_1 +; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7243,7 +8503,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: .LBB101_1: ; %atomicrmw.start +; VI-NEXT: .LBB123_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -7257,7 +8517,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB101_1 +; VI-NEXT: s_cbranch_execnz .LBB123_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7270,7 +8530,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -7284,7 +8544,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7310,7 +8570,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: v_mov_b32_e32 v4, s35 ; 
SI-NEXT: v_mov_b32_e32 v5, s34 -; SI-NEXT: .LBB102_1: ; %atomicrmw.start +; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v1 @@ -7329,7 +8589,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] -; SI-NEXT: s_cbranch_execnz .LBB102_1 +; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v10, 1 @@ -7351,7 +8611,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v5, s6 -; VI-NEXT: .LBB102_1: ; %atomicrmw.start +; VI-NEXT: .LBB124_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v9, v1 @@ -7365,7 +8625,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] -; VI-NEXT: s_cbranch_execnz .LBB102_1 +; VI-NEXT: s_cbranch_execnz .LBB124_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] @@ -7378,7 +8638,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v8, v1 @@ -7392,7 +8652,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -7419,7 +8679,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_mov_b32_e32 v3, s9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB103_1: ; %atomicrmw.start +; SI-NEXT: .LBB125_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7437,7 +8697,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB103_1 +; SI-NEXT: s_cbranch_execnz .LBB125_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -7460,7 +8720,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: .LBB103_1: ; %atomicrmw.start +; VI-NEXT: .LBB125_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -7473,7 +8733,7 @@ define amdgpu_kernel 
void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB103_1 +; VI-NEXT: s_cbranch_execnz .LBB125_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -7493,7 +8753,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7506,7 +8766,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7533,7 +8793,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB104_1: ; %atomicrmw.start +; SI-NEXT: .LBB126_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -7551,7 +8811,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB104_1 +; SI-NEXT: s_cbranch_execnz .LBB126_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -7579,7 +8839,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v5, s4 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: .LBB104_1: ; %atomicrmw.start +; VI-NEXT: .LBB126_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: v_mov_b32_e32 v8, v2 @@ -7592,7 +8852,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; VI-NEXT: s_cbranch_execnz .LBB104_1 +; VI-NEXT: s_cbranch_execnz .LBB126_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -7615,7 +8875,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -7628,7 +8888,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, 
s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -7658,7 +8918,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_mov_b32_e32 v3, s11 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: .LBB105_1: ; %atomicrmw.start +; SI-NEXT: .LBB127_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7676,7 +8936,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] -; SI-NEXT: s_cbranch_execnz .LBB105_1 +; SI-NEXT: s_cbranch_execnz .LBB127_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; @@ -7693,7 +8953,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: .LBB105_1: ; %atomicrmw.start +; VI-NEXT: .LBB127_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc @@ -7706,7 +8966,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] -; VI-NEXT: s_cbranch_execnz .LBB105_1 +; VI-NEXT: s_cbranch_execnz .LBB127_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; @@ -7722,7 +8982,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc @@ -7735,7 +8995,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) { ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: @@ -7760,7 +9020,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: .LBB106_1: ; %atomicrmw.start +; SI-NEXT: .LBB128_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc @@ -7778,7 +9038,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; SI-NEXT: s_cbranch_execnz .LBB106_1 +; SI-NEXT: s_cbranch_execnz .LBB128_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 @@ -7804,7 +9064,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: .LBB106_1: ; %atomicrmw.start +; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v3 ; VI-NEXT: 
v_mov_b32_e32 v8, v2 @@ -7817,7 +9077,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB106_1 +; VI-NEXT: s_cbranch_execnz .LBB128_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_or_b64 exec, exec, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -7840,7 +9100,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 @@ -7853,7 +9113,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -7866,24 +9126,208 @@ entry: ret void } -; --------------------------------------------------------------------- -; atomicrmw uinc_wrap -; --------------------------------------------------------------------- - -define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { -; SI-LABEL: global_atomic_uinc_wrap_i64_noret: +define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB129_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB129_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB129_1: ; %atomicrmw.start +; 
VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB129_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB130_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB130_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB130_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB130_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw uinc_wrap +; --------------------------------------------------------------------- + +define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i64_noret: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: global_atomic_uinc_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8230,6 +9674,80 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ret i64 %result } +define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; 
SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + ; --------------------------------------------------------------------- ; atomicrmw udec_wrap ; --------------------------------------------------------------------- @@ -8593,3 +10111,79 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst ret i64 %result } + +define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec_x2 
v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 + %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %result +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index d8e527828af52c..794c87b8883139 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -10986,9 +10986,750 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ret void } +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2 +; GFX7LESS-NEXT: .LBB18_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-NEXT: .LBB18_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: 
.LBB18_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-NEXT: .LBB18_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-NEXT: .LBB18_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB18_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1164-NEXT: .LBB18_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB18_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1132-NEXT: .LBB18_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX9-DPP-NEXT: .LBB18_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1064-DPP-NEXT: .LBB18_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1032-DPP-NEXT: .LBB18_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 
+; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1164-DPP-NEXT: .LBB18_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2 +; GFX1132-DPP-NEXT: .LBB18_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2 +; GFX7LESS-NEXT: .LBB19_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-NEXT: .LBB19_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-NEXT: .LBB19_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB19_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1164-NEXT: .LBB19_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB19_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: 
v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1132-NEXT: .LBB19_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX9-DPP-NEXT: .LBB19_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1064-DPP-NEXT: .LBB19_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1032-DPP-NEXT: .LBB19_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1164-DPP-NEXT: .LBB19_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2 +; GFX1132-DPP-NEXT: .LBB19_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!1 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index fde83411843237..4c829f302e0561 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -7510,7 +7510,679 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ret void } +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB12_2: ; 
%atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: .LBB12_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: .LBB12_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: .LBB12_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; 
GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: .LBB12_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-NEXT: .LBB12_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-NEXT: .LBB12_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: 
; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!1 = !{} + diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 2cedb3e481ee1f..0b889d9eb0a5d5 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -7510,7 +7510,678 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ret void } +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: .LBB12_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: .LBB12_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: .LBB12_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: .LBB12_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-NEXT: .LBB12_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-NEXT: .LBB12_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-DPP-NEXT: .LBB12_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-DPP-NEXT: .LBB12_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-DPP-NEXT: .LBB12_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-DPP-NEXT: .LBB12_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: 
v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-DPP-NEXT: .LBB12_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1, !amdgpu.ignore.denormal.mode !1 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: 
; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 
v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory.access !1 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!1 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 4477f028c6d2f2..89abdb2b754a44 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1673,3 +1673,713 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst ret void } + +define float @lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr) { +; VI-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 4.0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_add_rtn_f32 v0, v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX8-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 + ret float %result +} + +define void @lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr) { +; VI-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 4.0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_add_f32 v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: ds_add_f32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) { +; VI-LABEL: lds_atomic_fadd_ret_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_e32 v4, v3, v1 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_e32 v7, v2, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) { +; VI-LABEL: lds_atomic_fadd_noret_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB15_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_e32 v4, v2, v1 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB15_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_or_b32_e32 v7, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v5, v0, v7, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %val seq_cst + ret void +} + +define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) 
%ptr, <2 x bfloat> %val) { +; VI-LABEL: lds_atomic_fadd_ret_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v2, v0 +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: .LBB16_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; VI-NEXT: v_add_f32_e32 v2, v2, v3 +; VI-NEXT: v_add_f32_e32 v5, v5, v1 +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_bfe_u32 v8, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; VI-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v2, v5, v2, 16 +; VI-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB16_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: s_mov_b32 s9, 0x7060302 +; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX9-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 
v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v5, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v1, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst + ret <2 x bfloat> %result +} + +define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) { +; VI-LABEL: lds_atomic_fadd_noret_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_read_b32 v3, v0 +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; VI-NEXT: v_add_f32_e32 v4, v4, v2 +; VI-NEXT: v_add_f32_e32 v5, v5, v1 +; VI-NEXT: v_bfe_u32 v6, v4, 16, 1 +; VI-NEXT: 
v_bfe_u32 v8, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; VI-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; VI-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v3, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: s_mov_b32 s9, 0x7060302 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX9-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v4, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v4, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GFX8-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %val seq_cst + ret void +} + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll new file mode 100644 index 00000000000000..da1d23f1496cf3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-zero-size-arr.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; A zero-sized LDS array is an extension and should be rejected by the front-end in most cases. +; If one reaches this pass, it is lowered as dynamic LDS.
+ +@Var0 = linkonce_odr hidden local_unnamed_addr addrspace(3) global [0 x float] poison + +define void @fn(float %val, i32 %idx) { +; CHECK-LABEL: define void @fn( +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[VAR0]], align 4 +; CHECK-NEXT: [[VAR01:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(3) [[VAR01]], i32 [[IDX]] +; CHECK-NEXT: store float [[VAL]], ptr addrspace(3) [[PTR]], align 4 +; CHECK-NEXT: ret void +; + %ptr = getelementptr i32, ptr addrspace(3) @Var0, i32 %idx + store float %val, ptr addrspace(3) %ptr + ret void +} + +define amdgpu_kernel void @kernelA(float %val, i32 %idx) { +; CHECK-LABEL: define amdgpu_kernel void @kernelA( +; CHECK-SAME: float [[VAL:%.*]], i32 [[IDX:%.*]]) !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernelA.dynlds) ] +; CHECK-NEXT: tail call void @fn(float [[VAL]], i32 [[IDX]]) +; CHECK-NEXT: ret void +; + tail call void @fn(float %val, i32 %idx) + ret void +} +;. +; CHECK: [[META1]] = !{i32 0} +;. diff --git a/llvm/test/CodeGen/DirectX/fmad.ll b/llvm/test/CodeGen/DirectX/fmad.ll index 693e237e70dc02..e1f4e5cd50c4f0 100644 --- a/llvm/test/CodeGen/DirectX/fmad.ll +++ b/llvm/test/CodeGen/DirectX/fmad.ll @@ -21,8 +21,8 @@ entry: %0 = load half, ptr %p0.addr, align 2 %1 = load half, ptr %p1.addr, align 2 %2 = load half, ptr %p2.addr, align 2 - %dx.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2) - ret half %dx.fmad + %hlsl.fmad = call half @llvm.fmuladd.f16(half %0, half %1, half %2) + ret half %hlsl.fmad } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) @@ -40,8 +40,8 @@ entry: %0 = load float, ptr %p0.addr, align 4 %1 = load float, ptr %p1.addr, align 4 %2 = load float, ptr %p2.addr, align 4 - %dx.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) - ret float %dx.fmad + %hlsl.fmad = call float @llvm.fmuladd.f32(float %0, float %1, float %2) + ret float %hlsl.fmad } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) @@ -59,8 +59,8 @@ entry: %0 = load double, ptr %p0.addr, align 8 %1 = load double, ptr %p1.addr, align 8 %2 = load double, ptr %p2.addr, align 8 - %dx.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2) - ret double %dx.fmad + %hlsl.fmad = call double @llvm.fmuladd.f64(double %0, double %1, double %2) + ret double %hlsl.fmad } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll index 7db1048c258cd0..d3fa94779dd714 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll @@ -229,24 +229,24 @@ define signext i32 @test3() { ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: mflr r0 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stdu r1, -48(r1) ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r3, L..C0(r2) # target-flags(ppc-tlsldm) @"_$TLSML" -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r8, L..C3(r2) # target-flags(ppc-tlsld) @ElementIntTLSv2 ; 
SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r0, 64(r1) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r6, 2 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: bla .__tls_get_mod[PR] ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 1 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r5, ElementIntTLS2[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r6, ElementIntTLS3[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r7, ElementIntTLS4[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r9, ElementIntTLS5[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stwux r4, r3, r8 +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r7, ElementIntTLS3[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r6, 320(r5) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r5, 3 +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r6, ElementIntTLS4[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, 324(r7) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r5, L..C3(r2) # target-flags(ppc-tlsld) @ElementIntTLSv2 +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: la r7, ElementIntTLS5[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stwux r4, r3, r5 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 4 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, 24(r3) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 2 -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r3, 320(r5) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 3 -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r3, 324(r6) ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 88 -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, 328(r7) -; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r3, 332(r9) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, 328(r6) +; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r3, 332(r7) ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 102 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: addi r1, r1, 48 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r0, 16(r1) @@ -258,26 +258,26 @@ define signext i32 @test3() { ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: mflr r0 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stdu r1, -48(r1) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addis r3, L..C0@u(r2) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addis r6, L..C3@u(r2) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r0, 64(r1) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addis r6, L..C3@u(r2) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r3, L..C0@l(r3) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r6, L..C3@l(r6) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r7, 3 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: bla .__tls_get_mod[PR] +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r5, 2 +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r4, ElementIntTLS2[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, 320(r4) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 1 -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r5, ElementIntTLS2[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r7, ElementIntTLS3[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r8, ElementIntTLS4[TL]@ld(r3) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r9, ElementIntTLS5[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r5, ElementIntTLS3[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r7, 324(r5) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r5, ElementIntTLS4[TL]@ld(r3) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: la r7, ElementIntTLS5[TL]@ld(r3) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stwux r4, r3, r6 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 4 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, 24(r3) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 2 -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r3, 320(r5) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 3 -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r3, 
324(r7) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 88 -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, 328(r8) -; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r3, 332(r9) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, 328(r5) +; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r3, 332(r7) ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 102 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addi r1, r1, 48 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r0, 16(r1) @@ -351,12 +351,12 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 9, 3, 9 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 6, 3, 6 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stwux 4, 3, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 2 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 320(6) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 324(7) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 4 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 24(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 320(6) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 3 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 324(7) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 328(8) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 332(9) @@ -466,12 +466,12 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 9, 3, 9 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 6, 3, 6 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stwux 4, 3, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 2 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 320(6) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 324(7) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 4 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 24(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 320(6) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 3 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 324(7) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 328(8) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 332(9) diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll index 67d82c6908d75e..91013af7a31880 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll @@ -30,11 +30,11 @@ define signext i32 @StoreArrays1() { ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 2 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 3 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr @@ -46,11 +46,11 @@ define signext i32 @StoreArrays1() { ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; 
SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr @@ -88,16 +88,16 @@ entry: define signext i32 @StoreArrays2() { ; SMALL-LOCAL-EXEC-SMALLCM64-LABEL: StoreArrays2: ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r4, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 2 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 3 +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) +; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r4, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: add r4, r13, r4 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 0(r4) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 4 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 24(r4) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 2 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 3 -; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 88 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102 @@ -106,17 +106,17 @@ define signext i32 @StoreArrays2() { ; ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2: ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 3 +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addis r3, L..C0@u(r2) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 1 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r3, L..C0@l(r3) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) +; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 1 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: add r3, r13, r3 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 0(r3) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 4 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 24(r3) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13) -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3 -; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13) ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13) @@ -162,35 +162,35 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 24(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+15]]) mySmallLocalExecTLSv1[TL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, -460(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+21]]) mySmallLocalExecTLS4[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -32468(13) 
; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+17]]) mySmallLocalExecTLS2[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 3 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -16464(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+19]]) mySmallLocalExecTLS3[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, -460(13) -; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+21]]) mySmallLocalExecTLS4[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, 15544(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+23]]) mySmallLocalExecTLS5[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 102 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} blr ; DIS: 0000000000000040 (idx: [[#NFA+5]]) .StoreArrays2: +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -32468(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+17]]) mySmallLocalExecTLS2[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 3, 2, 0 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: [[#NFA+13]]) mySmallLocalExecTLSv2[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 1 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} ld 3, 0(3) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+13]]) mySmallLocalExecTLSv2[TE] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, -16464(13) +; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+19]]) mySmallLocalExecTLS3[TL] +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 1 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 13, 3 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 0(3) ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 4, 4 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, 24(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 2 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -32468(13) -; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+17]]) mySmallLocalExecTLS2[TL] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 3 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 3, -16464(13) -; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+19]]) mySmallLocalExecTLS3[TL] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} li 3, 88 ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} stw 4, -460(13) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TLS_LE (idx: [[#NFA+21]]) mySmallLocalExecTLS4[TL] @@ -227,4 +227,4 @@ entry: ; DIS: 000000000000be6c (idx: [[#NFA+19]]) mySmallLocalExecTLS3[TL]: ; DIS: 000000000000fcec (idx: [[#NFA+21]]) mySmallLocalExecTLS4[TL]: ; DIS: 0000000000013b6c (idx: [[#NFA+23]]) mySmallLocalExecTLS5[TL]: -; DIS: 00000000000179ec (idx: [[#NFA+25]]) mySmallLocalExecTLSv2[TL]: \ No newline at end of file +; DIS: 00000000000179ec (idx: [[#NFA+25]]) mySmallLocalExecTLSv2[TL]: diff --git a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll index 38b35dc6c81cf8..91a2283897f336 100644 --- a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll +++ b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll @@ -64,9 +64,9 @@ define i64 @StoreLargeAccess2() { ; CHECK-SMALLCM64-NEXT: li r3, 55 ; CHECK-SMALLCM64-NEXT: li r4, 64 ; CHECK-SMALLCM64-NEXT: std r3, mySmallTLS2[TL]@le+696(r13) +; CHECK-SMALLCM64-NEXT: add r3, r13, r5 +; CHECK-SMALLCM64-NEXT: std r4, 20000(r3) ; CHECK-SMALLCM64-NEXT: li r3, 142 -; CHECK-SMALLCM64-NEXT: add r5, r13, r5 -; CHECK-SMALLCM64-NEXT: std r4, 20000(r5) ; CHECK-LARGECM64: addis r3, L..C0@u(r2) ; CHECK-LARGECM64-NEXT: li r4, 0 ; CHECK-LARGECM64-NEXT: li r5, 23 @@ -75,8 +75,8 @@ define i64 @StoreLargeAccess2() { ; CHECK-LARGECM64-NEXT: add r3, r13, r3 ; CHECK-LARGECM64-NEXT: stdx r5, r3, r4 ; 
CHECK-LARGECM64-NEXT: addis r3, L..C1@u(r2) -; CHECK-LARGECM64-NEXT: li r4, 55 ; CHECK-LARGECM64-NEXT: li r5, 64 +; CHECK-LARGECM64-NEXT: li r4, 55 ; CHECK-LARGECM64-NEXT: ld r3, L..C1@l(r3) ; CHECK-LARGECM64-NEXT: std r4, mySmallTLS2[TL]@le+696(r13) ; CHECK-LARGECM64-NEXT: add r3, r13, r3 diff --git a/llvm/test/CodeGen/SPIRV/empty-module.ll b/llvm/test/CodeGen/SPIRV/empty-module.ll index b56e58ccaf8f1b..b7cea042ee5ddc 100644 --- a/llvm/test/CodeGen/SPIRV/empty-module.ll +++ b/llvm/test/CodeGen/SPIRV/empty-module.ll @@ -1,9 +1,15 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-OCL ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-DAG: OpCapability Addresses -; CHECK-DAG: OpCapability Linkage -; CHECK-DAG: OpCapability Kernel -; CHECK: %1 = OpExtInstImport "OpenCL.std" -; CHECK: OpMemoryModel Physical64 OpenCL -; CHECK: OpSource Unknown 0 +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOOCL +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpCapability Linkage +; CHECK-NOOCL-DAG: OpCapability Shader +; CHECK-OCL-DAG: OpCapability Addresses +; CHECK-OCL-DAG: OpCapability Kernel +; CHECK-OCL: %1 = OpExtInstImport "OpenCL.std" +; CHECK-NOOCL: OpMemoryModel Logical GLSL450 +; CHECK-OCL: OpMemoryModel Physical64 OpenCL +; CHECK-NOOCL: OpSource Unknown 0 +; CHECK-OCL: OpSource OpenCL_CPP 100000 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll index a3fec10a9e4bc9..ce9b8f09daead1 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fmad.ll @@ -6,22 +6,22 @@ define noundef half @fmad_half(half noundef %a, half noundef %b, half noundef %c) #0 { entry: ; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] Fma %[[#]] %[[#]] %[[#]] - %dx.fmad = call half @llvm.fmuladd.f16(half %a, half %b, half %c) - ret half %dx.fmad + %hlsl.fmad = call half @llvm.fmuladd.f16(half %a, half %b, half %c) + ret half %hlsl.fmad } define noundef float @fmad_float(float noundef %a, float noundef %b, float noundef %c) #0 { entry: ; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] Fma %[[#]] %[[#]] %[[#]] - %dx.fmad = call float @llvm.fmuladd.f32(float %a, float %b, float %c) - ret float %dx.fmad + %hlsl.fmad = call float @llvm.fmuladd.f32(float %a, float %b, float %c) + ret float %hlsl.fmad } define noundef double @fmad_double(double noundef %a, double noundef %b, double noundef %c) { entry: ; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] Fma %[[#]] %[[#]] %[[#]] - %dx.fmad = call double @llvm.fmuladd.f64(double %a, double %b, double %c) - ret double %dx.fmad + %hlsl.fmad = call double @llvm.fmuladd.f64(double %a, double %b, double %c) + ret double %hlsl.fmad } declare half @llvm.fmuladd.f16(half, half, half) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll new file mode 100644 index 00000000000000..b854412b6ec12a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/imad.ll @@ -0,0 +1,302 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#int_16:]] = OpTypeInt 16 0 +; CHECK-DAG: %[[#vec2_16:]] = OpTypeVector %[[#int_16]] 
2 +; CHECK-DAG: %[[#vec3_16:]] = OpTypeVector %[[#int_16]] 3 +; CHECK-DAG: %[[#vec4_16:]] = OpTypeVector %[[#int_16]] 4 +; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#vec2_32:]] = OpTypeVector %[[#int_32]] 2 +; CHECK-DAG: %[[#vec3_32:]] = OpTypeVector %[[#int_32]] 3 +; CHECK-DAG: %[[#vec4_32:]] = OpTypeVector %[[#int_32]] 4 +; CHECK-DAG: %[[#int_64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#vec2_64:]] = OpTypeVector %[[#int_64]] 2 +; CHECK-DAG: %[[#vec3_64:]] = OpTypeVector %[[#int_64]] 3 +; CHECK-DAG: %[[#vec4_64:]] = OpTypeVector %[[#int_64]] 4 + +define spir_func noundef i16 @test_mad_uint16_t(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#int_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#int_16]] %[[#mul]] %[[#arg2]] + %3 = mul nuw i16 %p0, %p1 + %4 = add nuw i16 %3, %p2 + ret i16 %4 +} + +define spir_func noundef <2 x i16> @test_mad_uint16_t2(<2 x i16> noundef %p0, <2 x i16> noundef %p1, <2 x i16> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec2_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec2_16]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <2 x i16> %p0, %p1 + %4 = add nuw <2 x i16> %3, %p2 + ret <2 x i16> %4 +} + +define spir_func noundef <3 x i16> @test_mad_uint16_t3(<3 x i16> noundef %p0, <3 x i16> noundef %p1, <3 x i16> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec3_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec3_16]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <3 x i16> %p0, %p1 + %4 = add nuw <3 x i16> %3, %p2 + ret <3 x i16> %4 +} + +define spir_func noundef <4 x i16> @test_mad_uint16_t4(<4 x i16> noundef %p0, <4 x i16> noundef %p1, <4 x i16> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec4_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec4_16]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <4 x i16> %p0, %p1 + %4 = add nuw <4 x i16> %3, %p2 + ret <4 x i16> %4 +} + +define spir_func noundef i16 @test_mad_int16_t(i16 noundef %p0, i16 noundef %p1, i16 noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#int_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#int_16]] %[[#mul]] %[[#arg2]] + %3 = mul nsw i16 %p0, %p1 + %4 = add nsw i16 %3, %p2 + ret i16 %4 +} + +define spir_func noundef <2 x i16> @test_mad_int16_t2(<2 x i16> noundef %p0, <2 x i16> noundef %p1, <2 x i16> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec2_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec2_16]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <2 x i16> %p0, %p1 + %4 = add nsw <2 x i16> %3, %p2 + ret <2 x i16> %4 +} + +define spir_func noundef <3 x i16> @test_mad_int16_t3(<3 x i16> 
noundef %p0, <3 x i16> noundef %p1, <3 x i16> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec3_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec3_16]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <3 x i16> %p0, %p1 + %4 = add nsw <3 x i16> %3, %p2 + ret <3 x i16> %4 +} + +define spir_func noundef <4 x i16> @test_mad_int16_t4(<4 x i16> noundef %p0, <4 x i16> noundef %p1, <4 x i16> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec4_16]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec4_16]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <4 x i16> %p0, %p1 + %4 = add nsw <4 x i16> %3, %p2 + ret <4 x i16> %4 +} +define spir_func noundef i32 @test_mad_int(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#int_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#int_32]] %[[#mul]] %[[#arg2]] + %3 = mul nsw i32 %p0, %p1 + %4 = add nsw i32 %3, %p2 + ret i32 %4 +} + +define spir_func noundef <2 x i32> @test_mad_int2(<2 x i32> noundef %p0, <2 x i32> noundef %p1, <2 x i32> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec2_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec2_32]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <2 x i32> %p0, %p1 + %4 = add nsw <2 x i32> %3, %p2 + ret <2 x i32> %4 +} + +define spir_func noundef <3 x i32> @test_mad_int3(<3 x i32> noundef %p0, <3 x i32> noundef %p1, <3 x i32> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec3_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec3_32]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <3 x i32> %p0, %p1 + %4 = add nsw <3 x i32> %3, %p2 + ret <3 x i32> %4 +} + +define spir_func noundef <4 x i32> @test_mad_int4(<4 x i32> noundef %p0, <4 x i32> noundef %p1, <4 x i32> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec4_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec4_32]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <4 x i32> %p0, %p1 + %4 = add nsw <4 x i32> %3, %p2 + ret <4 x i32> %4 +} + +define spir_func noundef i64 @test_mad_int64_t(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#int_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#int_64]] %[[#mul]] %[[#arg2]] + %3 = mul nsw i64 %p0, %p1 + %4 = add nsw i64 %3, %p2 + ret i64 %4 +} + +define spir_func noundef <2 x i64> @test_mad_int64_t2(<2 x i64> noundef %p0, <2 x i64> noundef %p1, <2 x i64> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = 
OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec2_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec2_64]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <2 x i64> %p0, %p1 + %4 = add nsw <2 x i64> %3, %p2 + ret <2 x i64> %4 +} + +define spir_func noundef <3 x i64> @test_mad_int64_t3(<3 x i64> noundef %p0, <3 x i64> noundef %p1, <3 x i64> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec3_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec3_64]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <3 x i64> %p0, %p1 + %4 = add nsw <3 x i64> %3, %p2 + ret <3 x i64> %4 +} + +define spir_func noundef <4 x i64> @test_mad_int64_t4(<4 x i64> noundef %p0, <4 x i64> noundef %p1, <4 x i64> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec4_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec4_64]] %[[#mul]] %[[#arg2]] + %3 = mul nsw <4 x i64> %p0, %p1 + %4 = add nsw <4 x i64> %3, %p2 + ret <4 x i64> %4 +} + +define spir_func noundef i32 @test_mad_uint(i32 noundef %p0, i32 noundef %p1, i32 noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#int_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#int_32]] %[[#mul]] %[[#arg2]] + %3 = mul nuw i32 %p0, %p1 + %4 = add nuw i32 %3, %p2 + ret i32 %4 +} + +define spir_func noundef <2 x i32> @test_mad_uint2(<2 x i32> noundef %p0, <2 x i32> noundef %p1, <2 x i32> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec2_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec2_32]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <2 x i32> %p0, %p1 + %4 = add nuw <2 x i32> %3, %p2 + ret <2 x i32> %4 +} + +define spir_func noundef <3 x i32> @test_mad_uint3(<3 x i32> noundef %p0, <3 x i32> noundef %p1, <3 x i32> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec3_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec3_32]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <3 x i32> %p0, %p1 + %4 = add nuw <3 x i32> %3, %p2 + ret <3 x i32> %4 +} + +define spir_func noundef <4 x i32> @test_mad_uint4(<4 x i32> noundef %p0, <4 x i32> noundef %p1, <4 x i32> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec4_32]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec4_32]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <4 x i32> %p0, %p1 + %4 = add nuw <4 x i32> %3, %p2 + ret <4 x i32> %4 +} + +define spir_func noundef i64 @test_mad_uint64_t(i64 noundef %p0, i64 noundef %p1, i64 noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#int_64]] 
%[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#int_64]] %[[#mul]] %[[#arg2]] + %3 = mul nuw i64 %p0, %p1 + %4 = add nuw i64 %3, %p2 + ret i64 %4 +} + +define spir_func noundef <2 x i64> @test_mad_uint64_t2(<2 x i64> noundef %p0, <2 x i64> noundef %p1, <2 x i64> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec2_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec2_64]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <2 x i64> %p0, %p1 + %4 = add nuw <2 x i64> %3, %p2 + ret <2 x i64> %4 +} + +define spir_func noundef <3 x i64> @test_mad_uint64_t3(<3 x i64> noundef %p0, <3 x i64> noundef %p1, <3 x i64> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec3_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec3_64]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <3 x i64> %p0, %p1 + %4 = add nuw <3 x i64> %3, %p2 + ret <3 x i64> %4 +} + +define spir_func noundef <4 x i64> @test_mad_uint64_t4(<4 x i64> noundef %p0, <4 x i64> noundef %p1, <4 x i64> noundef %p2) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#mul:]] = OpIMul %[[#vec4_64]] %[[#arg0]] %[[#arg1]] + ; CHECK: OpIAdd %[[#vec4_64]] %[[#mul]] %[[#arg2]] + %3 = mul nuw <4 x i64> %p0, %p1 + %4 = add nuw <4 x i64> %3, %p2 + ret <4 x i64> %4 +} diff --git a/llvm/test/CodeGen/SystemZ/zos-frameaddr.ll b/llvm/test/CodeGen/SystemZ/zos-frameaddr.ll new file mode 100644 index 00000000000000..597731780676e9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/zos-frameaddr.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=s390x-ibm-zos | FileCheck --check-prefix=CHECK %s + +; The current function's frame address is the address of +; the optional back chain slot. +define ptr @fp0() nounwind { +; CHECK-LABEL: fp0: +; CHECK: la 3, 2048(4) +; CHECK-NEXT: b 2(7) +entry: + %0 = tail call ptr @llvm.frameaddress(i32 0) + ret ptr %0 +} + +; Check that the frame address is correct in the presence +; of a stack frame. +define ptr @fp0f() nounwind { +; CHECK-LABEL: fp0f: +; CHECK: stmg 6, 7, 1904(4) +; CHECK-NEXT: aghi 4, -160 +; CHECK-NEXT: la 3, 2048(4) +; CHECK-NEXT: lg 7, 2072(4) +; CHECK-NEXT: aghi 4, 160 +; CHECK-NEXT: b 2(7) +entry: + %0 = alloca i64, align 8 + %1 = tail call ptr @llvm.frameaddress(i32 0) + ret ptr %1 +} + +; Check the caller's frame address. +define ptr @fpcaller() nounwind "backchain" { +; CHECK-LABEL: fpcaller: +; CHECK: stmg 4, 7, 2048(4) +; CHECK-NEXT: lg 3, 2048(4) +; CHECK-NEXT: lmg 4, 7, 2048(4) +; CHECK-NEXT: b 2(7) +entry: + %0 = tail call ptr @llvm.frameaddress(i32 1) + ret ptr %0 +} + +; Check the caller's caller's frame address.
+define ptr @fpcallercaller() nounwind "backchain" { +; CHECK-LABEL: fpcallercaller: +; CHECK: stmg 4, 7, 2048(4) +; CHECK-NEXT: lg 1, 2048(4) +; CHECK-NEXT: lg 3, 0(1) +; CHECK-NEXT: lmg 4, 7, 2048(4) +; CHECK-NEXT: b 2(7) +entry: + %0 = tail call ptr @llvm.frameaddress(i32 2) + ret ptr %0 +} + +declare ptr @llvm.frameaddress(i32) nounwind readnone diff --git a/llvm/test/CodeGen/X86/threadlocal_address.ll b/llvm/test/CodeGen/X86/threadlocal_address.ll index 6c6649b6419c34..52ff413e6988f8 100644 --- a/llvm/test/CodeGen/X86/threadlocal_address.ll +++ b/llvm/test/CodeGen/X86/threadlocal_address.ll @@ -4,7 +4,7 @@ define noundef i32 @foo() { ; CHECK: %0:gr64 = MOV64rm $rip, 1, $noreg, target-flags(x86-gottpoff) @i, $noreg :: (load (s64) from got) -; CHECK: %1:gr32 = MOV32rm %0, 1, $noreg, 0, $fs :: (load (s32) from %ir.0) +; CHECK: %1:gr32 = MOV32rm %0, 1, $noreg, 0, $fs :: (dereferenceable load (s32) from %ir.0) ; CHECK: %2:gr32 = nsw INC32r %1, implicit-def dead $eflags ; CHECK: MOV32mr %0, 1, $noreg, 0, $fs, %2 :: (store (s32) into %ir.0) ; CHECK: $eax = COPY %2 @@ -22,7 +22,7 @@ entry: @j = thread_local addrspace(1) global ptr addrspace(0) @i, align 4 define noundef i32 @bar() { ; CHECK: %0:gr64 = MOV64rm $rip, 1, $noreg, target-flags(x86-gottpoff) @j, $noreg :: (load (s64) from got) -; CHECK: %1:gr32 = MOV32rm %0, 1, $noreg, 0, $fs :: (load (s32) from %ir.0, addrspace 1) +; CHECK: %1:gr32 = MOV32rm %0, 1, $noreg, 0, $fs :: (dereferenceable load (s32) from %ir.0, addrspace 1) ; CHECK: %2:gr32 = nsw INC32r %1, implicit-def dead $eflags ; CHECK: MOV32mr %0, 1, $noreg, 0, $fs, %2 :: (store (s32) into %ir.0, addrspace 1) ; CHECK: $eax = COPY %2 diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 1addedf3c3d960..0459d47eed8178 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -2453,9 +2453,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; ; XOP-LABEL: splatconstant_funnnel_v16i8: ; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOP-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index ebcb1cb15a600e..e81b9adfdd3e3d 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -2344,17 +2344,15 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 
+; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 638a3cdaa2c1d2..b839452725a95f 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2462,9 +2462,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; ; XOP-LABEL: splatconstant_funnnel_v16i8: ; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOP-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 3fabf720da71c3..7b6b0ea83c7eea 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -2145,17 +2145,15 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 8c8b7cc5161056..7f4006792ef1a8 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -627,24 +627,23 @@ define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not_with_undef(<4 x i32> %x, <4 x i32> %y) { ; X86-LABEL: vec_add_of_not_with_undef: ; X86: # %bb.0: -; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-NEXT: paddd %xmm1, %xmm0 +; X86-NEXT: pcmpeqd %xmm2, %xmm2 +; X86-NEXT: pxor %xmm1, %xmm2 +; X86-NEXT: paddd %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LIN-LABEL: vec_add_of_not_with_undef: ; X64-LIN: # %bb.0: -; X64-LIN-NEXT: psubd %xmm1, %xmm0 -; X64-LIN-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-LIN-NEXT: paddd %xmm1, %xmm0 +; X64-LIN-NEXT: pcmpeqd 
%xmm2, %xmm2 +; X64-LIN-NEXT: pxor %xmm1, %xmm2 +; X64-LIN-NEXT: paddd %xmm2, %xmm0 ; X64-LIN-NEXT: retq ; ; X64-WIN-LABEL: vec_add_of_not_with_undef: ; X64-WIN: # %bb.0: -; X64-WIN-NEXT: movdqa (%rcx), %xmm1 -; X64-WIN-NEXT: psubd (%rdx), %xmm1 ; X64-WIN-NEXT: pcmpeqd %xmm0, %xmm0 -; X64-WIN-NEXT: paddd %xmm1, %xmm0 +; X64-WIN-NEXT: pxor (%rdx), %xmm0 +; X64-WIN-NEXT: paddd (%rcx), %xmm0 ; X64-WIN-NEXT: retq %t0 = sub <4 x i32> %x, %y %r = add <4 x i32> %t0, @@ -654,26 +653,25 @@ define <4 x i32> @vec_add_of_not_with_undef(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not_with_undef_decrement(<4 x i32> %x, <4 x i32> %y) { ; X86-LABEL: vec_add_of_not_with_undef_decrement: ; X86: # %bb.0: -; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: pcmpeqd %xmm1, %xmm1 -; X86-NEXT: psubd %xmm1, %xmm0 +; X86-NEXT: pcmpeqd %xmm2, %xmm2 +; X86-NEXT: pxor %xmm1, %xmm2 +; X86-NEXT: paddd %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LIN-LABEL: vec_add_of_not_with_undef_decrement: ; X64-LIN: # %bb.0: -; X64-LIN-NEXT: psubd %xmm1, %xmm0 -; X64-LIN-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-LIN-NEXT: psubd %xmm1, %xmm0 +; X64-LIN-NEXT: pcmpeqd %xmm2, %xmm2 +; X64-LIN-NEXT: pxor %xmm1, %xmm2 +; X64-LIN-NEXT: paddd %xmm2, %xmm0 ; X64-LIN-NEXT: retq ; ; X64-WIN-LABEL: vec_add_of_not_with_undef_decrement: ; X64-WIN: # %bb.0: -; X64-WIN-NEXT: movdqa (%rcx), %xmm0 -; X64-WIN-NEXT: psubd (%rdx), %xmm0 -; X64-WIN-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-WIN-NEXT: psubd %xmm1, %xmm0 +; X64-WIN-NEXT: pcmpeqd %xmm0, %xmm0 +; X64-WIN-NEXT: pxor (%rdx), %xmm0 +; X64-WIN-NEXT: paddd (%rcx), %xmm0 ; X64-WIN-NEXT: retq %t0 = sub <4 x i32> %x, %y - %r = add <4 x i32> %t0, + %r = sub <4 x i32> %t0, ret <4 x i32> %r } diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll index 6d40e2b7eaf9c5..62c0a24c60d77f 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll @@ -580,5 +580,5 @@ ehcleanup: ; preds = %entry cleanupret from %0 unwind to caller } ;. -; CHECK-INLINE: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000} +; CHECK-INLINE: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575} ;. 
diff --git a/llvm/test/Instrumentation/AddressSanitizer/basic.ll b/llvm/test/Instrumentation/AddressSanitizer/basic.ll index 2064db5a0918fc..1aef8c03c9d3e5 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/basic.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/basic.ll @@ -218,4 +218,4 @@ define void @test_swifterror_3() sanitize_address { ; CHECK: attributes #[[#ATTR]] = { nounwind } ; PROF -; CHECK: ![[PROF]] = !{!"branch_weights", i32 1, i32 100000} +; CHECK: ![[PROF]] = !{!"branch_weights", i32 1, i32 1048575} diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll index be8b6c8b95ce99..a6d7ae67d8453c 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll @@ -1,23 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 ; Test -sanitizer-coverage-inline-bool-flag=1 ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-bool-flag=1 -S | FileCheck %s -; CHECK: $foo = comdat nodeduplicate -; CHECK: @__sancov_gen_ = private global [1 x i1] zeroinitializer, section "__sancov_bools", comdat($foo), align 1{{$}} -; CHECK: @__start___sancov_bools = extern_weak hidden global i1 -; CHECK-NEXT: @__stop___sancov_bools = extern_weak hidden global i1 -; CHECK: @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_bool_flag], section "llvm.metadata" -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr @__sancov_gen_], section "llvm.metadata" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" +;. +; CHECK: @__sancov_lowest_stack = external thread_local(initialexec) global i64 +; CHECK: @__sancov_gen_ = private global [1 x i1] zeroinitializer, section "__sancov_bools", comdat($foo), align 1 +; CHECK: @__start___sancov_bools = extern_weak hidden global i1 +; CHECK: @__stop___sancov_bools = extern_weak hidden global i1 +; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @sancov.module_ctor_bool_flag, ptr @sancov.module_ctor_bool_flag }] +; CHECK: @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_bool_flag], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr @__sancov_gen_], section "llvm.metadata" +;. 
define void @foo() { -; CHECK-LABEL: @foo( +; CHECK-LABEL: define void @foo() comdat { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__sancov_gen_, align 1, !nosanitize ![[#EMPTY:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__sancov_gen_, align 1, !nosanitize [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i1 [[TMP0]], false -; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 2: -; CHECK-NEXT: store i1 true, ptr @__sancov_gen_, align 1, !nosanitize ![[#EMPTY:]] +; CHECK-NEXT: store i1 true, ptr @__sancov_gen_, align 1, !nosanitize [[META0]] ; CHECK-NEXT: br label [[TMP3]] ; CHECK: 3: ; CHECK-NEXT: ret void @@ -25,6 +29,10 @@ define void @foo() { entry: ret void } -; CHECK: call void @__sanitizer_cov_bool_flag_init(ptr @__start___sancov_bools, ptr @__stop___sancov_bools) -; CHECK: ![[#EMPTY]] = !{} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +;. +; CHECK: [[META0]] = !{} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll index 0e5eaae96d731a..00547afd1b9ef2 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth.ll @@ -1,43 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 ; This check verifies that stack depth instrumentation works correctly. -; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 \ -; RUN: -sanitizer-coverage-stack-depth -S | FileCheck %s -; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 \ -; RUN: -sanitizer-coverage-stack-depth -sanitizer-coverage-trace-pc-guard \ -; RUN: -S | FileCheck %s +; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -S | FileCheck %s --check-prefixes=L1 +; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=3 -sanitizer-coverage-stack-depth -S -sanitizer-coverage-trace-pc-guard | FileCheck %s --check-prefixes=L3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; CHECK: @__sancov_lowest_stack = thread_local(initialexec) global i64 -1 @__sancov_lowest_stack = thread_local global i64 0, align 8 +;. +; L1: @__sancov_lowest_stack = thread_local(initialexec) global i64 -1, align 8 +;. 
+; L3: @__sancov_lowest_stack = thread_local(initialexec) global i64 -1, align 8 +; L3: @__sancov_gen_ = private global [1 x i32] zeroinitializer, section "__sancov_guards", comdat($foo), align 4 +; L3: @__sancov_gen_.1 = private global [1 x i32] zeroinitializer, section "__sancov_guards", comdat($bar), align 4 +; L3: @__sancov_gen_.2 = private global [1 x i32] zeroinitializer, section "__sancov_guards", comdat($_ZTW21__sancov_lowest_stack), align 4 +; L3: @__start___sancov_guards = extern_weak hidden global i32 +; L3: @__stop___sancov_guards = extern_weak hidden global i32 +; L3: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @sancov.module_ctor_trace_pc_guard, ptr @sancov.module_ctor_trace_pc_guard }] +; L3: @llvm.used = appending global [1 x ptr] [ptr @sancov.module_ctor_trace_pc_guard], section "llvm.metadata" +; L3: @llvm.compiler.used = appending global [3 x ptr] [ptr @__sancov_gen_, ptr @__sancov_gen_.1, ptr @__sancov_gen_.2], section "llvm.metadata" +;. define i32 @foo() { +; L1-LABEL: define i32 @foo() { +; L1-NEXT: entry: +; L1-NEXT: ret i32 7 +; +; L3-LABEL: define i32 @foo() comdat { +; L3-NEXT: entry: +; L3-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_) #[[ATTR2:[0-9]+]] +; L3-NEXT: ret i32 7 +; entry: -; CHECK-LABEL: define i32 @foo -; CHECK-NOT: call ptr @llvm.frameaddress.p0(i32 0) -; CHECK-NOT: @__sancov_lowest_stack -; CHECK: ret i32 7 ret i32 7 } define i32 @bar() { +; L1-LABEL: define i32 @bar() { +; L1-NEXT: entry: +; L1-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; L1-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; L1-NEXT: [[TMP2:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0:![0-9]+]] +; L1-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] +; L1-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1:![0-9]+]] +; L1: 4: +; L1-NEXT: store i64 [[TMP1]], ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0]] +; L1-NEXT: br label [[TMP5]] +; L1: 5: +; L1-NEXT: [[CALL:%.*]] = call i32 @foo() +; L1-NEXT: ret i32 [[CALL]] +; +; L3-LABEL: define i32 @bar() comdat { +; L3-NEXT: entry: +; L3-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_.1) #[[ATTR2]] +; L3-NEXT: [[TMP0:%.*]] = call ptr @llvm.frameaddress.p0(i32 0) +; L3-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; L3-NEXT: [[TMP2:%.*]] = load i64, ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0:![0-9]+]] +; L3-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] +; L3-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1:![0-9]+]] +; L3: 4: +; L3-NEXT: store i64 [[TMP1]], ptr @__sancov_lowest_stack, align 8, !nosanitize [[META0]] +; L3-NEXT: br label [[TMP5]] +; L3: 5: +; L3-NEXT: [[CALL:%.*]] = call i32 @foo() +; L3-NEXT: ret i32 [[CALL]] +; entry: -; CHECK-LABEL: define i32 @bar -; CHECK: [[framePtr:%[^ \t]+]] = call ptr @llvm.frameaddress.p0(i32 0) -; CHECK: [[frameInt:%[^ \t]+]] = ptrtoint ptr [[framePtr]] to [[intType:i[0-9]+]] -; CHECK: [[lowest:%[^ \t]+]] = load [[intType]], ptr @__sancov_lowest_stack -; CHECK: [[cmp:%[^ \t]+]] = icmp ult [[intType]] [[frameInt]], [[lowest]] -; CHECK: br i1 [[cmp]], label %[[ifLabel:[^ \t]+]], label -; CHECK: [[ifLabel]]: -; CHECK: store [[intType]] [[frameInt]], ptr @__sancov_lowest_stack -; CHECK: %call = call i32 @foo() -; CHECK: ret i32 %call %call = call i32 @foo() ret i32 %call } define weak_odr hidden ptr @_ZTW21__sancov_lowest_stack() { +; L1-LABEL: define weak_odr hidden ptr 
@_ZTW21__sancov_lowest_stack() { +; L1-NEXT: ret ptr @__sancov_lowest_stack +; +; L3-LABEL: define weak_odr hidden ptr @_ZTW21__sancov_lowest_stack() comdat { +; L3-NEXT: call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_.2) #[[ATTR2]] +; L3-NEXT: ret ptr @__sancov_lowest_stack +; ret ptr @__sancov_lowest_stack } +;. +; L1: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; L3: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; L3: attributes #[[ATTR1:[0-9]+]] = { nounwind } +; L3: attributes #[[ATTR2]] = { nomerge } +;. +; L1: [[META0]] = !{} +; L1: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. +; L3: [[META0]] = !{} +; L3: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/MC/Disassembler/SystemZ/insns-z14.txt b/llvm/test/MC/Disassembler/SystemZ/insns-z14.txt index c73b50c1c2fbdc..f1657bdea84d17 100644 --- a/llvm/test/MC/Disassembler/SystemZ/insns-z14.txt +++ b/llvm/test/MC/Disassembler/SystemZ/insns-z14.txt @@ -3251,3 +3251,11 @@ # CHECK: wftcixb %v4, %v21, 1656 0xe7 0x45 0x67 0x88 0x44 0x4a +# CHECK: tpei %r0, %r15 +0xb9 0xa1 0x00 0x0f + +# CHECK: tpei %r15, %r0 +0xb9 0xa1 0x00 0xf0 + +# CHECK: tpei %r4, %r10 +0xb9 0xa1 0x00 0x4a diff --git a/llvm/test/MC/SystemZ/insn-bad-z13.s b/llvm/test/MC/SystemZ/insn-bad-z13.s index 4ae4fd99fbd4dc..46aebc75e4fabd 100644 --- a/llvm/test/MC/SystemZ/insn-bad-z13.s +++ b/llvm/test/MC/SystemZ/insn-bad-z13.s @@ -3024,3 +3024,6 @@ wledb %v0, %v0, -1, 0 wledb %v0, %v0, 16, 0 + #CHECK: error: instruction requires: test-pending-external-interrupt + #CHECK: tpei %r0, %r1 + tpei %r0, %r1 diff --git a/llvm/test/MC/SystemZ/insn-good-z14.s b/llvm/test/MC/SystemZ/insn-good-z14.s index ec12283ecbae89..385fd3ce7d4267 100644 --- a/llvm/test/MC/SystemZ/insn-good-z14.s +++ b/llvm/test/MC/SystemZ/insn-good-z14.s @@ -2720,3 +2720,9 @@ wftcixb %v31, %v0, 0 wftcixb %v4, %v21, 0x678 +#CHECK: tpei %r0, %r15 # encoding: [0xb9,0xa1,0x00,0x0f] +#CHECK: tpei %r15, %r0 # encoding: [0xb9,0xa1,0x00,0xf0] +#CHECK: tpei %r4, %r10 # encoding: [0xb9,0xa1,0x00,0x4a] + tpei %r0, %r15 + tpei %r15, %r0 + tpei %r4, %r10 diff --git a/llvm/test/MachineVerifier/verify-inlineasmbr.mir b/llvm/test/MachineVerifier/verify-inlineasmbr.mir index fd3cbc5f8b2394..fe54379c1bc763 100644 --- a/llvm/test/MachineVerifier/verify-inlineasmbr.mir +++ b/llvm/test/MachineVerifier/verify-inlineasmbr.mir @@ -1,4 +1,4 @@ -# RUN: not --crash llc -run-pass=none -verify-machineinstrs %s 2>&1 \ +# RUN: not --crash llc -run-pass=none -verify-machineinstrs %s -o /dev/null 2>&1 \ # RUN: | FileCheck %s # REQUIRES: powerpc-registered-target diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll new file mode 100644 index 00000000000000..8bbbcd16cb1afc --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll @@ -0,0 +1,3717 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a 
-passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 
+define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4 +; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fadd_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ 
[[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], 
[[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], 
[[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: 
[[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi 
float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = 
bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], 
[[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: 
[[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float 
@test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast 
i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; 
GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float 
[[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr 
addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; 
GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 
} [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 
+; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: 
br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; 
GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX10-SAME: 
ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, 
float %value) { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float 
@test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 
[[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: 
atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, 
!amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 
[[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 { +; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret float [[TMP5]] +; +; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast 
float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret float [[TMP5]] +; +; GFX11-LABEL: define float 
@test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; +; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fadd (no return) +;--------------------------------------------------------------------- + +define void @test_atomicrmw_fadd_noret_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define 
void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst + ret void +} + +define void 
@test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr 
addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void 
@test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void 
@test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label 
[[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], 
label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void 
@test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret void +} + +define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 { +; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret void +; +; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast 
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f32_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret void
+;
+; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret void
+;
+; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
+; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret void
+;
+; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
[[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret void +; +; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX10-LABEL: define void 
@test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret void +; +; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; +; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX12: atomicrmw.start: +; GFX12-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX12: atomicrmw.end: +; GFX12-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, 
align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret void +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fsub_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: 
[[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float 
@test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; 
COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ 
[[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; 
COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float 
@test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 
[[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr 
addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) { +; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret float [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } + +!0 = !{} +;. +; GFX940: [[META0]] = !{} +;.
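For readers skimming the generated checks in the f32 file above: apart from the GFX940 fast paths on fadd, every fadd, fsub, fmax, and fmin variant is expanded by the atomic-expand pass into the same compare-exchange retry loop, differing only in how the new value is computed (an fadd/fsub instruction, or a call to @llvm.maxnum.f32/@llvm.minnum.f32). The hand-written IR below is a minimal sketch of that loop shape for fadd; it is not code from the patch, and the function name and labels are illustrative.

; Sketch of the cmpxchg loop the atomic-expand pass emits for a float
; atomicrmw fadd that has no native path (illustrative names).
define float @sketch_fadd_cas_loop(ptr addrspace(1) %ptr, float %value) {
entry:
  ; Seed the loop with an ordinary load of the current value.
  %init = load float, ptr addrspace(1) %ptr, align 4
  br label %cas.loop

cas.loop:
  %loaded = phi float [ %init, %entry ], [ %observed.f, %cas.loop ]
  %new = fadd float %loaded, %value
  ; cmpxchg operates on integers, so both values are bitcast to i32.
  %new.i = bitcast float %new to i32
  %old.i = bitcast float %loaded to i32
  %pair = cmpxchg ptr addrspace(1) %ptr, i32 %old.i, i32 %new.i seq_cst seq_cst, align 4
  %success = extractvalue { i32, i1 } %pair, 1
  %observed = extractvalue { i32, i1 } %pair, 0
  %observed.f = bitcast i32 %observed to float
  ; On failure, retry using the value the cmpxchg actually observed.
  br i1 %success, label %cas.end, label %cas.loop

cas.end:
  ret float %observed.f
}

On success the returned value is the one that was in memory before the exchange, matching atomicrmw semantics; the COMMON checks above encode exactly this pattern, and the fsub, fmax, and fmin sections show it being taken on all of the tested targets, GFX940 included.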
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll new file mode 100644 index 00000000000000..e1890da15b0c64 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll @@ -0,0 +1,1685 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 
+define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64 +; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8 +; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fadd_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret double [[TMP5]] +; +; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; 
GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8 +; GFX940-NEXT: ret double [[RES]] +; +; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX10-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory_access(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_daz(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
+; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP5]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP5]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_f64_dynamic(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
+; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; COMMON-NEXT: [[TMP1:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP5:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP1]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP1]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP1]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP5]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[NEWLOADED]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw fsub
+;---------------------------------------------------------------------
+
+define double @test_atomicrmw_fsub_f64_global_system(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[RES]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[RES]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[RES]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[RES]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 } poison, double [[TMP6]], 0
+; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1
+; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret double [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0
+ ret double %res
+}
+
+define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
+; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP2:%.*]] = alloca double, align 8, addrspace(5)
+; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: store double [[NEW]], ptr addrspace(5) [[TMP2]], align 8
+; COMMON-NEXT: [[TMP9:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP4]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP2]], i32 5, i32 5)
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP2]])
+; COMMON-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8
+; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]])
+; COMMON-NEXT: [[TMP7:%.*]] = insertvalue { double, i1 }
poison, double [[TMP6]], 0 +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } [[TMP7]], i1 [[TMP9]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP8]], 1 +; COMMON-NEXT: [[TMP5]] = extractvalue { double, i1 } [[TMP8]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double 
@test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label 
[[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = 
addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double 
[[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double 
%value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) 
[[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) { +; 
COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP3:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP4:%.*]] = alloca double, align 8, addrspace(5) +; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; COMMON-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: store double [[LOADED]], ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: store double [[TMP2]], ptr addrspace(5) [[TMP4]], align 8 +; COMMON-NEXT: [[TMP10:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 8, ptr [[TMP5]], ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP4]], i32 5, i32 5) +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP4]]) +; COMMON-NEXT: [[TMP7:%.*]] = load double, ptr addrspace(5) [[TMP3]], align 8 +; COMMON-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[TMP3]]) +; COMMON-NEXT: [[TMP8:%.*]] = insertvalue { double, i1 } poison, double [[TMP7]], 0 +; COMMON-NEXT: [[TMP9:%.*]] = insertvalue { double, i1 } [[TMP8]], i1 [[TMP10]], 1 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { double, i1 } [[TMP9]], 1 +; COMMON-NEXT: [[TMP6]] = extractvalue { double, i1 } [[TMP9]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret double [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret double %res +} + +attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } + +!0 = !{} +;. +; GFX940: [[META0]] = !{} +;. 
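+; A sketch of the expansion pattern exercised by the f64 tests above, inferred +; from the autogenerated checks rather than stated anywhere in the test: when +; the operation must be expanded, the pass emits a compare-exchange loop of +; the form +;   %loaded = phi double [ %init, %entry ], [ %newloaded, %atomicrmw.start ] +;   %new    = fsub double %loaded, %value            ; or fadd / llvm.minnum / llvm.maxnum +;   %pair   = cmpxchg ptr addrspace(1) %ptr, i64 <bitcast %loaded>, i64 <bitcast %new> seq_cst seq_cst, align 8 +; retrying until the success flag extracted from %pair is set. When the +; atomicrmw is under-aligned (align 4 on an 8-byte double, as in the +; amdgpu.ignore.denormal.mode tests), the inline cmpxchg cannot be used; the +; loop instead spills the expected and desired values to addrspace(5) allocas +; and calls the __atomic_compare_exchange libcall through a flat +; (addrspacecast) pointer, rebuilding the { double, i1 } result pair from the +; libcall's output.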
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll new file mode 100644 index 00000000000000..b711b9fe4edf08 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll @@ -0,0 +1,828 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; atomicrmw xchg +;--------------------------------------------------------------------- + +; xchg is supported over PCIe, so no expansion is necessary +define i32 @test_atomicrmw_xchg_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +; xchg is supported over PCIe, so no expansion is necessary. 
Metadata should be ignored. +define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_xchg_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw add +;--------------------------------------------------------------------- + +; add is supported over PCIe, so no expansion is necessary +define i32 @test_atomicrmw_add_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 
+define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw sub +;--------------------------------------------------------------------- + +; expansion is necessary, sub is not supported over PCIe +define i32 @test_atomicrmw_sub_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw and +;--------------------------------------------------------------------- + +; 
expansion is necessary, operation not supported over PCIe +define i32 @test_atomicrmw_and_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +;--------------------------------------------------------------------- +; atomicrmw nand +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported +define i32 @test_atomicrmw_nand_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { +; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } 
[[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw or
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_or_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw xor
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_xor_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw max
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_max_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_max_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw min
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_min_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_min_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umax
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_umax_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umax_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umax ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umin
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_umin_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i32 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[LOADED]], i32 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw umin ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw uinc_wrap
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_uinc_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw udec_wrap
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_udec_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i32 %res
+}
+
+!0 = !{}
+;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
+; GFX940: [[META0]] = !{}
+;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX803: {{.*}}
+; GFX906: {{.*}}
+; GFX908: {{.*}}
+; GFX90A: {{.*}}
+; GFX940: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
new file mode 100644
index 00000000000000..d67bf2e450b8bc
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
@@ -0,0 +1,828 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; atomicrmw xchg
+;---------------------------------------------------------------------
+
+; xchg is supported over PCIe, so no expansion is necessary
+define i64 @test_atomicrmw_xchg_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
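+; The cmpxchg expansion loop checked for nand and the min/max family later
+; in this file always has the same shape. As a reading aid, a minimal sketch
+; of that pattern (the %p/%v names and block labels are illustrative only
+; and do not come from the autogenerated checks):
+;
+;     %init = load i64, ptr addrspace(1) %p
+;     br label %loop
+;   loop:
+;     %loaded = phi i64 [ %init, %entry ], [ %newloaded, %loop ]
+;     %new = <operation applied to %loaded and %v>
+;     %pair = cmpxchg ptr addrspace(1) %p, i64 %loaded, i64 %new seq_cst seq_cst
+;     %success = extractvalue { i64, i1 } %pair, 1
+;     %newloaded = extractvalue { i64, i1 } %pair, 0
+;     br i1 %success, label %end, label %loop
+;   end:
+;     ret i64 %newloaded
+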
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xchg_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw xchg ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw add
+;---------------------------------------------------------------------
+
+; add is supported over PCIe, so no expansion is necessary
+define i64 @test_atomicrmw_add_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+; add is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[RES]]
+;
+ %res = atomicrmw add ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw sub
+;---------------------------------------------------------------------
+
+; expansion is necessary, sub is not supported over PCIe
+define i64 @test_atomicrmw_sub_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
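+; A note on the two annotations exercised throughout this file: both are
+; assertions supplied by the frontend or programmer, not facts the pass can
+; derive itself. !amdgpu.no.fine.grained.memory is understood to promise
+; that the atomic never touches fine-grained host-allocated memory, and
+; !amdgpu.no.remote.memory.access that it never touches memory reached over
+; a bus such as PCIe. In every test in this file the metadata is either
+; preserved on the untouched atomicrmw or dropped along with the
+; instruction when the cmpxchg expansion is produced.
+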
+;---------------------------------------------------------------------
+; atomicrmw and
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_and_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw nand
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported
+define i64 @test_atomicrmw_nand_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw nand ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw or
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_or_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw xor
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_xor_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw max
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_max_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_max_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw max ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw min
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_min_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_min_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp sle i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw min ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw umax
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_umax_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umax_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umax ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory 
!0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw umin +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_umin_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 
} [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = icmp ule i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[LOADED]], i64 [[VALUE]] +; COMMON-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw umin ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw uinc_wrap +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_uinc_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] 
+; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +;--------------------------------------------------------------------- +; atomicrmw udec_wrap +;--------------------------------------------------------------------- + +; expansion is necessary, operation not supported over PCIe +define i64 @test_atomicrmw_udec_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst + ret i64 %res +} + +define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, i64 %value) { +; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; COMMON-NEXT: ret i64 [[NEWLOADED]] +; + %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value 
seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i64 %res +} + +!0 = !{} +;. +; GFX803: [[META0]] = !{} +;. +; GFX906: [[META0]] = !{} +;. +; GFX908: [[META0]] = !{} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX10: [[META0]] = !{} +;. +; GFX11: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 9010dec31c1610..337e51f9a912c2 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=CI %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GFX9 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefix=GFX908 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefix=GFX90A %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefix=GFX940 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefix=GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,CI %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX11 %s define void @test_atomicrmw_fadd_f32_global_no_use_unsafe(ptr addrspace(1) %ptr, float %value) #0 { ; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe( @@ -1179,698 +1179,133 @@ define float @test_atomicrmw_fadd_f32_local(ptr addrspace(3) %ptr, float %value) } define half @test_atomicrmw_fadd_f16_flat(ptr %ptr, half %value) { -; CI-LABEL: @test_atomicrmw_fadd_f16_flat( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CI-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; CI-NEXT: br label [[ATOMICRMW_START:%.*]] -; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CI-NEXT: 
[[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; CI-NEXT: ret half [[TMP7]] -; -; GFX9-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX9-NEXT: ret half [[TMP7]] -; -; GFX908-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] -; 
GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX908-NEXT: ret half [[TMP7]] -; -; GFX90A-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX90A-NEXT: ret half [[TMP7]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; 
GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX940-NEXT: ret half [[TMP7]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_f16_flat( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 -; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 
[[EXTRACTED3]] to half -; GFX11-NEXT: ret half [[TMP7]] +; ALL-LABEL: @test_atomicrmw_fadd_f16_flat( +; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) +; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 +; ALL-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; ALL-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; ALL-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 +; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half +; ALL-NEXT: ret half [[TMP7]] ; %res = atomicrmw fadd ptr %ptr, half %value seq_cst ret half %res } define half @test_atomicrmw_fadd_f16_global(ptr addrspace(1) %ptr, half %value) { -; CI-LABEL: @test_atomicrmw_fadd_f16_global( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; CI-NEXT: br label [[ATOMICRMW_START:%.*]] -; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } 
[[TMP6]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; CI-NEXT: ret half [[TMP7]] -; -; GFX9-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX9-NEXT: ret half [[TMP7]] -; -; GFX908-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; 
GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX908-NEXT: ret half [[TMP7]] -; -; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX90A-NEXT: ret half [[TMP7]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: 
[[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX940-NEXT: ret half [[TMP7]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_f16_global( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half -; GFX11-NEXT: ret half [[TMP7]] +; ALL-LABEL: @test_atomicrmw_fadd_f16_global( +; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) +; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 +; ALL-NEXT: 
[[PTRLSB:%.*]] = and i64 [[TMP1]], 3 +; ALL-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 +; ALL-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 +; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] +; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] +; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] +; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] +; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 +; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half +; ALL-NEXT: ret half [[TMP7]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, half %value seq_cst ret half %res } define half @test_atomicrmw_fadd_f16_global_align4(ptr addrspace(1) %ptr, half %value) { -; CI-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; CI-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 -; CI-NEXT: br label [[ATOMICRMW_START:%.*]] -; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half -; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] -; CI-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CI: atomicrmw.end: -; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half -; CI-NEXT: ret half [[TMP5]] -; -; GFX9-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX9-NEXT: [[NEW:%.*]] 
= fadd half [[TMP2]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX9: atomicrmw.end: -; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half -; GFX9-NEXT: ret half [[TMP5]] -; -; GFX908-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX908: atomicrmw.end: -; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half -; GFX908-NEXT: ret half [[TMP5]] -; -; GFX90A-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half -; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half -; GFX90A-NEXT: ret half [[TMP5]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_f16_global_align4( -; GFX940-NEXT: [[TMP1:%.*]] = load i32, ptr 
addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX940-NEXT: ret half [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_f16_global_align4(
-; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
-; GFX11-NEXT: ret half [[TMP5]]
+; ALL-LABEL: @test_atomicrmw_fadd_f16_global_align4(
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
+; ALL-NEXT: ret half [[TMP5]]
 ;
 %res = atomicrmw fadd ptr addrspace(1) %ptr, half %value seq_cst, align 4
 ret half %res
 }

 define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
-; CI-LABEL: @test_atomicrmw_fadd_f16_local(
-; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
-; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; CI-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; CI-NEXT: ret half [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX9-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX9-NEXT: ret half [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX908-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX908-NEXT: ret half [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX90A-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX90A-NEXT: ret half [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX940-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX940-NEXT: ret half [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
-; GFX11-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
-; GFX11-NEXT: ret half [[TMP7]]
+; ALL-LABEL: @test_atomicrmw_fadd_f16_local(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
+; ALL-NEXT: ret half [[TMP7]]
 ;
 %res = atomicrmw fadd ptr addrspace(3) %ptr, half %value seq_cst
 ret half %res
@@ -2315,7 +1750,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
 ; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]]
+; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
 ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
@@ -2331,7 +1766,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
 ; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]]
+; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
 ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
 ; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
@@ -2368,7 +1803,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
 ; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
+; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
 ; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -2384,7 +1819,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
 ; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
+; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
 ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -2400,7 +1835,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
 ; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]]
+; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
 ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -2424,7 +1859,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
 ; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6:[0-9]+]]
+; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
 ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -2445,7 +1880,7 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
 ; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]]
+; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
 ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
@@ -2481,1794 +1916,2540 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa
 }

 define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_local(
-; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_local(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_local_align4(ptr addrspace(3) %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic, align 4
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_agent(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; ALL-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value syncscope("agent") monotonic
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_global_agent_align4(ptr addrspace(1) %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4(
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value syncscope("agent") monotonic, align 4
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_global_system(ptr addrspace(1) %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_system(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; ALL-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value monotonic
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4(
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value monotonic, align 4
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR9:[0-9]+]]
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_flat_agent(ptr %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; ALL-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %res = atomicrmw fadd ptr %ptr, bfloat %value syncscope("agent") monotonic
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_flat_agent_align4(ptr %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP5]]
+;
+ %res = atomicrmw fadd ptr %ptr, bfloat %value syncscope("agent") monotonic, align 4
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_flat_system(ptr %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; ALL-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; ALL-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic
+ ret bfloat %res
+}
+
+define bfloat @test_atomicrmw_fadd_bf16_flat_system_align4(ptr %ptr, bfloat %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP5]]
+;
+ %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 4
+ ret bfloat %res
+}
+
+define void @test_atomicrmw_fadd_f32_global_system_noret(ptr addrspace(1) %ptr, float %value) {
+; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_noret(
+; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT: ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT: ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT: ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_noret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT: ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT: ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
-; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_noret(
+; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
-; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT: ret bfloat [[TMP7]]
+; GFX11-NEXT: ret void
 ;
- %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic
- ret bfloat %res
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic
+ ret void
 }

-define bfloat @test_atomicrmw_fadd_bf16_local_align4(ptr addrspace(3) %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; CI-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+define float @test_atomicrmw_fadd_f32_global_system_ret(ptr addrspace(1) %ptr, float %value) {
+; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_ret(
+; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT: ret bfloat [[TMP5]]
+; CI-NEXT: ret float [[TMP5]]
 ;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_ret(
+; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT: ret bfloat [[TMP5]]
+; GFX9-NEXT: ret float [[TMP5]]
 ;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_ret(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT: ret bfloat [[TMP5]]
+; GFX908-NEXT: ret float [[TMP5]]
 ;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_ret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT: ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX940-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT: ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_align4(
-; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; GFX90A-NEXT: ret float [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_ret(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4
+; GFX940-NEXT: ret float [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_ret(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ;
GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP5]] +; GFX11-NEXT: ret float [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic, align 4 - ret bfloat %res + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret float %ret } -define bfloat @test_atomicrmw_fadd_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) { -; CI-LABEL: @test_atomicrmw_fadd_bf16_global_agent( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +define void @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; CI-NEXT: ret bfloat [[TMP7]] -; -; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_agent( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; 
GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX9-NEXT: ret bfloat [[TMP7]] -; -; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_agent( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 
[[MASK]], -1 -; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX908-NEXT: ret bfloat [[TMP7]] -; -; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_agent( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to 
bfloat -; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX90A-NEXT: ret bfloat [[TMP7]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_agent( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], 
[[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX940-NEXT: ret bfloat [[TMP7]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_agent( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP7]] +; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value syncscope("agent") monotonic - ret bfloat %res + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret 
void } -define bfloat @test_atomicrmw_fadd_bf16_global_agent_align4(ptr addrspace(1) %ptr, bfloat %value) { -; CI-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4( -; CI-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +define float @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; CI-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; CI-NEXT: ret bfloat [[TMP5]] +; CI-NEXT: ret float [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4( -; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = 
bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX9-NEXT: ret bfloat [[TMP5]] +; GFX9-NEXT: ret float [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4( -; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX908-NEXT: ret bfloat [[TMP5]] +; GFX908-NEXT: ret float [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4( -; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX90A-NEXT: 
[[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX90A-NEXT: ret bfloat [[TMP5]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4( -; GFX940-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX940-NEXT: ret bfloat [[TMP5]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_agent_align4( -; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: 
[[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4 +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP5]] +; GFX11-NEXT: ret float [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value syncscope("agent") monotonic, align 4 - ret bfloat %res + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret } -define bfloat @test_atomicrmw_fadd_bf16_global_system(ptr addrspace(1) %ptr, bfloat %value) { -; CI-LABEL: @test_atomicrmw_fadd_bf16_global_system( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +define void @test_atomicrmw_fadd_f32_daz_global_system_noret(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], 
i32 [[INSERTED]] monotonic monotonic, align 4 -; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; CI-NEXT: ret bfloat [[TMP7]] -; -; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_system( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX9-NEXT: br i1 [[SUCCESS]], label 
[[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX9-NEXT: ret bfloat [[TMP7]] -; -; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_system( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX908-NEXT: ret bfloat [[TMP7]] -; -; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_system( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] 
to i32 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX90A-NEXT: ret bfloat [[TMP7]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_system( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to 
i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX940-NEXT: ret bfloat [[TMP7]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_system( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4) -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]] -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 
[[NEWLOADED]] to float ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] -; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP7]] +; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value monotonic - ret bfloat %res + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret void } -define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %ptr, bfloat %value) { -; CI-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4( -; CI-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +define float @test_atomicrmw_fadd_f32_daz_global_system_ret(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; CI-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; CI-NEXT: ret bfloat [[TMP5]] +; CI-NEXT: ret float [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4( -; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], 
-65536 -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX9-NEXT: ret bfloat [[TMP5]] +; GFX9-NEXT: ret float [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4( -; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX908-NEXT: ret bfloat [[TMP5]] +; GFX908-NEXT: ret float [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4( -; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; 
GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX90A-NEXT: ret bfloat [[TMP5]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4( -; GFX940-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX940-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX940-NEXT: ret bfloat [[TMP5]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_bf16_global_system_align4( -; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret( +; 
GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 -; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP5]] +; GFX11-NEXT: ret float [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %value monotonic, align 4 - ret bfloat %res + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic + ret float %ret } -define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 { -; CI-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 -; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] -; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CI-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +define void @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] -; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; CI-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] -; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; 
CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] -; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] -; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; CI-NEXT: ret bfloat [[TMP7]] -; -; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] -; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 -; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 -; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 -; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] -; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] -; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX9-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] -; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] -; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] -; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX9-NEXT: ret bfloat [[TMP7]] -; -; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] -; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 -; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 -; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 -; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] -; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] -; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX908-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] -; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] -; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] -; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX908-NEXT: 
[[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX908-NEXT: ret bfloat [[TMP7]] -; -; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]] -; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 -; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 -; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 -; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] -; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] -; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX90A-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] -; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] -; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] -; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX90A-NEXT: ret bfloat [[TMP7]] -; -; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]] -; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 -; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 -; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 -; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] -; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr 
addrspace(3) [[ALIGNEDADDR]], align 4 -; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX940: atomicrmw.start: -; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] -; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX940-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] -; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] -; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 -; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX940: atomicrmw.end: -; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] -; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX940-NEXT: ret bfloat [[TMP7]] -; -; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] -; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 -; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 -; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 -; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] -; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] -; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX11-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] -; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 -; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] -; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) 
[[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] -; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 -; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP7]] +; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, bfloat %value monotonic - ret bfloat %res + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void } -define bfloat @test_atomicrmw_fadd_bf16_flat_agent(ptr %ptr, bfloat %value) { -; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_agent( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4) -; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3 -; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3 -; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32 -; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]] -; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 -; CI-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4 +define float @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #3 { +; CI-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: 
[[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_daz_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] 
monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #4 { +; CI-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label 
[[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) #4 { +; CI-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 
[[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret float [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: 
@test_atomicrmw_fadd_f32_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_local_noret(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX908-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_noret( +; GFX11-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic + ret void +} + +define float @test_atomicrmw_fadd_f32_local_ret(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ 
[[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX9-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX9-NEXT: ret float [[RET]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX908-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX908-NEXT: ret float [[RET]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX90A-NEXT: ret float [[RET]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_ret( +; GFX11-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4 +; GFX11-NEXT: ret float [[RET]] +; + %ret = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic + ret float %ret +} + +define void @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr 
addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_noret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0:![0-9]+]] +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret void +} + +define float @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; CI-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret float [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX9-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX9-NEXT: ret float [[RET]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX908-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX908-NEXT: ret float [[RET]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret float [[RET]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RET]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_f32_local_ret__amdgpu_ignore_denormal_mode( +; GFX11-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX11-NEXT: ret float [[RET]] +; + %ret = atomicrmw fadd ptr addrspace(3) %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret float %ret +} + +define void @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret(ptr addrspace(1) %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; 
CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = 
extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic + ret void +} + +define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret(ptr addrspace(1) %ptr, double %value) #5 { +; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret double [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret( +; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX9-NEXT: br i1 [[SUCCESS]], 
label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret(
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8
+; GFX940-NEXT: ret double [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic
+ ret double %ret
+}
+
+define void @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) #5 {
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_noret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
+;
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) #5 {
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret double [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret double [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
+;
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret double %ret
+}
+
+define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(ptr addrspace(3) %ptr, double %value) #5 {
+; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret void
+;
+ %unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic
+ ret void
+}
+
+define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(ptr addrspace(3) %ptr, double %value) #5 {
+; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret double [[TMP5]]
+;
+ %ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic
+ ret double %ret
+}
+
+define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 {
+; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret void
+;
+ %unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 {
+; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret double [[TMP5]]
+;
+ %ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret double %ret
+}
+
+define void @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, float %value) {
+; CI-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT: ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT: ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT: ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT: ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT: ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_agent(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT: ret bfloat [[TMP7]]
+; GFX11-NEXT: ret void
 ;
- %res = atomicrmw fadd ptr %ptr, bfloat %value syncscope("agent") monotonic
- ret bfloat %res
+ %unused = atomicrmw fadd ptr %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret void
 }
-define bfloat @test_atomicrmw_fadd_bf16_flat_agent_align4(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; CI-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+define float @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, float %value) {
+; CI-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT: ret bfloat [[TMP5]]
+; CI-NEXT: ret float [[TMP5]]
 ;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT: ret bfloat [[TMP5]]
+; GFX9-NEXT: ret float [[TMP5]]
 ;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT: ret bfloat [[TMP5]]
+; GFX908-NEXT: ret float [[TMP5]]
 ;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT: ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX940-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT: ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_agent_align4(
-; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: ret float [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret float [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") monotonic monotonic, align 4
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX11-NEXT: ret bfloat [[TMP5]]
+; GFX11-NEXT: ret float [[TMP5]]
 ;
- %res = atomicrmw fadd ptr %ptr, bfloat %value syncscope("agent") monotonic, align 4
- ret bfloat %res
+ %ret = atomicrmw fadd ptr %ptr, float %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret float %ret
 }
-define bfloat @test_atomicrmw_fadd_bf16_flat_system(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CI-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; CI-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CI-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+define void @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 {
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CI-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CI-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; CI-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; CI-NEXT: ret bfloat [[TMP7]]
-;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX9-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX9-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX9-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX9-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX9-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX9-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX9-NEXT: ret bfloat [[TMP7]]
-;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX908-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX908-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX908-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX908-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX908-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX908-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX908-NEXT: ret bfloat [[TMP7]]
-;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX90A-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX90A-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX90A-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX90A-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX90A-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX90A-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX90A-NEXT: ret bfloat [[TMP7]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX940-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX940-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX940-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX940-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX940-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX940-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX940-NEXT: ret bfloat [[TMP7]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_system(
-; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; GFX11-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
-; GFX11-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; GFX11-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_noret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
-; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GFX11-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11: atomicrmw.end:
-; GFX11-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; GFX11-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
-; GFX11-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
-; GFX11-NEXT: ret bfloat [[TMP7]]
+; GFX11-NEXT: ret void
 ;
- %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic
- ret bfloat %res
+ %unused = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
+ ret void
 }
-define bfloat @test_atomicrmw_fadd_bf16_flat_system_align4(ptr %ptr, bfloat %value) {
-; CI-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; CI-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+define double @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(ptr %ptr, double %value) #5 {
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; CI: atomicrmw.start:
-; CI-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CI-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; CI-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; CI-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; CI-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CI-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI: atomicrmw.end:
-; CI-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; CI-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; CI-NEXT: ret bfloat [[TMP5]]
+; CI-NEXT: ret double [[TMP5]]
 ;
-; GFX9-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX9: atomicrmw.start:
-; GFX9-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX9-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX9-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX9-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX9-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX9-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9: atomicrmw.end:
-; GFX9-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX9-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX9-NEXT: ret bfloat [[TMP5]]
+; GFX9-NEXT: ret double [[TMP5]]
 ;
-; GFX908-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX908-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX908-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX908-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX908-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908: atomicrmw.end:
-; GFX908-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX908-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX908-NEXT: ret bfloat [[TMP5]]
+; GFX908-NEXT: ret double [[TMP5]]
 ;
-; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX90A-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX90A: atomicrmw.start:
-; GFX90A-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX90A-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX90A-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX90A-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX90A-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX90A-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A: atomicrmw.end:
-; GFX90A-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX90A-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX90A-NEXT: ret bfloat [[TMP5]]
-;
-; GFX940-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX940-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
-; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
-; GFX940-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
-; GFX940-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
-; GFX940-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
-; GFX940-NEXT: ret bfloat [[TMP5]]
-;
-; GFX11-LABEL: @test_atomicrmw_fadd_bf16_flat_system_align4(
-; GFX11-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; GFX90A-NEXT: ret double [[TMP5]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret double [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_flat_system_ret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
 ; GFX11: atomicrmw.start:
-; GFX11-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; GFX11-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[VALUE:%.*]]
-; GFX11-NEXT:
[[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 -; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 -; GFX11-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 -; GFX11-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX11-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 -; GFX11-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat -; GFX11-NEXT: ret bfloat [[TMP5]] +; GFX11-NEXT: ret double [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, bfloat %value monotonic, align 4 - ret bfloat %res + %ret = atomicrmw fadd ptr %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0 + ret double %ret +} + +define void @test_atomicrmw_fadd_f32_region_noret(ptr addrspace(2) %ptr, float %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f32_region_noret( +; ALL-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(2) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(2) %ptr, float %value monotonic + ret void +} + +define float @test_atomicrmw_fadd_f32_region_ret(ptr addrspace(2) %ptr, float %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f32_region_ret( +; ALL-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(2) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } 
[[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret float [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(2) %ptr, float %value monotonic + ret float %ret +} + +define void @test_atomicrmw_fadd_f64_region_noret(ptr addrspace(2) %ptr, double %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f64_region_noret( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(2) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %unused = atomicrmw fadd ptr addrspace(2) %ptr, double %value monotonic + ret void +} + +define double @test_atomicrmw_fadd_f64_region_ret(ptr addrspace(2) %ptr, double %value) { +; ALL-LABEL: @test_atomicrmw_fadd_f64_region_ret( +; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(2) [[PTR:%.*]], align 8 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(2) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret double [[TMP5]] +; + %ret = atomicrmw fadd ptr addrspace(2) %ptr, double %value monotonic + ret double %ret +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent(ptr %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label 
[[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label 
[[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> 
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent(
+; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) {
+; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret(
+; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: ret void
+;
+ %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+ ret void
 }
 
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #2 = { strictfp }
+attributes #3 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #4 = { "denormal-fp-math-f32"="dynamic,dynamic" }
+attributes #5 = { "denormal-fp-math"="dynamic,dynamic" }
+
+!0 = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
new file mode 100644
index 00000000000000..01a23097008ce1
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
@@ -0,0 +1,859 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s
+
+;---------------------------------------------------------------------
+; TODO: atomicrmw xchg
+;---------------------------------------------------------------------
+
+; ; xchg is supported over PCIe, so no expansion is necessary
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst
+; ret <2 x bfloat> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0
+; ret <2 x bfloat> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0
+; ret <2 x bfloat> %res
+; }
+
+; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
+; define <2 x bfloat> @test_atomicrmw_xchg_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 +; ret <2 x bfloat> %res +; } + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> 
@test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ 
[[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2bf16_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; 
COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + 
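+; The fadd tests above and below all check the same expansion shape: none of
+; the targeted subtargets has a native v2bf16 atomic here, so atomic-expand
+; rewrites the atomicrmw into a compare-exchange retry loop over the raw
+; 32-bit value (cmpxchg only accepts integer or pointer operands, hence the
+; bitcasts). The commented-out function below is a minimal hand-written
+; sketch of that shape for reference; the function and value names are
+; illustrative only and are not referenced by any RUN line or CHECK in this
+; file.
+;
+; define <2 x bfloat> @cas_loop_sketch(ptr addrspace(1) %p, <2 x bfloat> %v) {
+; entry:
+;   %orig = load <2 x bfloat>, ptr addrspace(1) %p, align 4
+;   br label %loop
+; loop:                                             ; CAS retry loop
+;   %loaded = phi <2 x bfloat> [ %orig, %entry ], [ %new.loaded, %loop ]
+;   %new = fadd <2 x bfloat> %loaded, %v            ; recompute from last observed value
+;   %new.i = bitcast <2 x bfloat> %new to i32       ; operate on the raw bits
+;   %old.i = bitcast <2 x bfloat> %loaded to i32
+;   %pair = cmpxchg ptr addrspace(1) %p, i32 %old.i, i32 %new.i seq_cst seq_cst, align 4
+;   %ok = extractvalue { i32, i1 } %pair, 1         ; did the location still hold %old.i?
+;   %raw = extractvalue { i32, i1 } %pair, 0        ; observed value; retried on failure
+;   %new.loaded = bitcast i32 %raw to <2 x bfloat>
+;   br i1 %ok, label %done, label %loop
+; done:
+;   ret <2 x bfloat> %new.loaded
+; }
+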
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x 
bfloat> %value) #1 { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x 
bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 
[[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; 
COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], 
align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, 
!amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> 
@test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: 
[[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, 
!amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x bfloat> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x bfloat> %res +} + +attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } + +!0 = !{} +;; NOTE: 
These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll new file mode 100644 index 00000000000000..2a1824b0ca4aef --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll @@ -0,0 +1,859 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s + +;--------------------------------------------------------------------- +; TODO: atomicrmw xchg +;--------------------------------------------------------------------- + +; ; xchg is supported over PCIe, so no expansion is necessary +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst +; ret <2 x half> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 +; ret <2 x half> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. +; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 +; ret <2 x half> %res +; } + +; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 
+; define <2 x half> @test_atomicrmw_xchg_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 +; ret <2 x half> %res +; } + +;--------------------------------------------------------------------- +; atomicrmw fadd +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) 
[[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: 
[[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access___denormal_fp_mode_v2f16_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; 
COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> 
@test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_daz( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access__denormal_mode_dynamic( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] 
{ +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fsub +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 
[[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> 
@test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], 
[ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmax +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], 
i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> 
@test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: 
atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x 
half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +;--------------------------------------------------------------------- +; atomicrmw fmin +;--------------------------------------------------------------------- + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; 
COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: 
[[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: 
[[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, <2 x half> %value) { +; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access( +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) +; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 +; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: +; COMMON-NEXT: ret <2 x half> [[TMP6]] +; + %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0, !amdgpu.ignore.denormal.mode !0 + ret <2 x half> %res +} + +attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } + +!0 = !{} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} +; GFX12: {{.*}} +; GFX803: {{.*}} +; GFX906: {{.*}} +; GFX908: {{.*}} +; GFX90A: {{.*}} +; GFX940: {{.*}} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index e32c218e9fe56a..097d7b6ac57717 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -124,6 +124,16 @@ define i32 @test_atomicrmw_xor_0_global_system(ptr addrspace(1) %ptr) { } +define i32 @test_atomicrmw_or_0_global_system__metadata(ptr addrspace(1) %ptr) { +; CHECK-LABEL: define i32 @test_atomicrmw_or_0_global_system__metadata( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = atomicrmw add ptr addrspace(1) [[PTR]], i32 0 seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; CHECK-NEXT: ret i32 [[RES]] +; + %res = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + !0 = !{} ;. ; CHECK: [[META0]] = !{} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll index bfbe4899bedd10..9ea2db86d7f32a 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -182,4 +182,14 @@ entry: ret void } +; CHECK-LABEL: @atomicrmw_add_global_to_flat_preserve_amdgpu_md( +; CHECK-NEXT: %ret = atomicrmw add ptr addrspace(1) %global.ptr, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 +define i32 @atomicrmw_add_global_to_flat_preserve_amdgpu_md(ptr addrspace(1) %global.ptr, i32 %y) #0 { + %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr + %ret = atomicrmw add ptr %cast, i32 %y seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %ret +} + attributes #0 = { nounwind } + +!0 = !{} diff --git a/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll b/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll new file mode 100644 index 00000000000000..ec7edd277dd7ad --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/inline-atomicrmw-md-preserve.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s + +; Ensure that custom metadata survives inlining + +define i32 @atomic_xor(ptr addrspace(1) %ptr, i32 %val) { +; CHECK-LABEL: define i32 @atomic_xor( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.no.remote.memory.access [[META0]] +; CHECK-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %val monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0 + ret i32 %res +} + +define i32 @caller(ptr addrspace(1) %ptr, i32 %val) { +; CHECK-LABEL: define i32 @caller( +; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) { +; CHECK-NEXT: [[RES_I:%.*]] = 
atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VAL]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]] +; CHECK-NEXT: ret i32 [[RES_I]] +; + %res = call i32 @atomic_xor(ptr addrspace(1) %ptr, i32 %val) + ret i32 %res +} + +!0 = !{} +;. +; CHECK: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 39b4ad80550889..56ee54d351e762 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -4091,6 +4091,43 @@ define i32 @fold_zext_addition_fail2(i8 %x) { ret i32 %r } +define i32 @fold_zext_nneg_add_const(i8 %x) { +; CHECK-LABEL: @fold_zext_nneg_add_const( +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[TMP1]], 98 +; CHECK-NEXT: ret i32 [[R]] +; + %xx = add nsw i8 %x, 123 + %ze = zext nneg i8 %xx to i32 + %r = add nsw i32 %ze, -25 + ret i32 %r +} + +define i32 @fold_zext_nneg_add_const_fail1(i8 %x) { +; CHECK-LABEL: @fold_zext_nneg_add_const_fail1( +; CHECK-NEXT: [[XX:%.*]] = add nsw i8 [[X:%.*]], 123 +; CHECK-NEXT: [[ZE:%.*]] = zext i8 [[XX]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[ZE]], -25 +; CHECK-NEXT: ret i32 [[R]] +; + %xx = add nsw i8 %x, 123 + %ze = zext i8 %xx to i32 + %r = add nsw i32 %ze, -25 + ret i32 %r +} + +define i32 @fold_zext_nneg_add_const_fail2(i8 %x) { +; CHECK-LABEL: @fold_zext_nneg_add_const_fail2( +; CHECK-NEXT: [[XX:%.*]] = add i8 [[X:%.*]], 123 +; CHECK-NEXT: [[ZE:%.*]] = zext nneg i8 [[XX]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[ZE]], -25 +; CHECK-NEXT: ret i32 [[R]] +; + %xx = add i8 %x, 123 + %ze = zext nneg i8 %xx to i32 + %r = add nsw i32 %ze, -25 + ret i32 %r +} declare void @llvm.assume(i1) declare void @fake_func(i32) diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index 389264e2f70759..4d907800219d63 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -1486,3 +1486,235 @@ define i1 @fcmp_fadd_fast_zero(float %x, float %y) { %cmp = fcmp ugt float %add, %y ret i1 %cmp } + +define i1 @fcmp_ueq_sel_x_negx(float %x) { +; CHECK-LABEL: @fcmp_ueq_sel_x_negx( +; CHECK-NEXT: [[RES:%.*]] = fcmp ueq float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp ueq float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp ueq float %sel, 0.000000e+00 + ret i1 %res +} + +define i1 @fcmp_une_sel_x_negx(float %x) { +; CHECK-LABEL: @fcmp_une_sel_x_negx( +; CHECK-NEXT: [[RES:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp une float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp une float %sel, 0.000000e+00 + ret i1 %res +} + +define i1 @fcmp_oeq_sel_x_negx(float %x) { +; CHECK-LABEL: @fcmp_oeq_sel_x_negx( +; CHECK-NEXT: [[RES:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp oeq float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp oeq float %sel, 0.000000e+00 + ret i1 %res +} + +define i1 @fcmp_one_sel_x_negx(float %x) { +; CHECK-LABEL: @fcmp_one_sel_x_negx( +; CHECK-NEXT: [[RES:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp one float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp one float %sel, 0.000000e+00 + ret 
i1 %res +} + +define i1 @fcmp_ueq_sel_x_negx_nzero(float %x) { +; CHECK-LABEL: @fcmp_ueq_sel_x_negx_nzero( +; CHECK-NEXT: [[RES:%.*]] = fcmp ueq float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp ueq float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp ueq float %sel, -0.000000e+00 + ret i1 %res +} + +define i1 @fcmp_une_sel_x_negx_nzero(float %x) { +; CHECK-LABEL: @fcmp_une_sel_x_negx_nzero( +; CHECK-NEXT: [[RES:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp une float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp une float %sel, -0.000000e+00 + ret i1 %res +} + +define i1 @fcmp_oeq_sel_x_negx_nzero(float %x) { +; CHECK-LABEL: @fcmp_oeq_sel_x_negx_nzero( +; CHECK-NEXT: [[RES:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp oeq float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp oeq float %sel, -0.000000e+00 + ret i1 %res +} + +define i1 @fcmp_one_sel_x_negx_nzero(float %x) { +; CHECK-LABEL: @fcmp_one_sel_x_negx_nzero( +; CHECK-NEXT: [[RES:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RES]] +; + %f = fcmp one float %x, 0.000000e+00 + %neg = fneg float %x + %sel = select i1 %f, float %x, float %neg + %res = fcmp one float %sel, -0.000000e+00 + ret i1 %res +} + +define <8 x i1> @fcmp_ueq_sel_x_negx_vec(<8 x float> %x) { +; CHECK-LABEL: @fcmp_ueq_sel_x_negx_vec( +; CHECK-NEXT: [[RES:%.*]] = fcmp ueq <8 x float> [[X:%.*]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[RES]] +; + %f = fcmp ueq <8 x float> %x, zeroinitializer + %neg = fneg <8 x float> %x + %sel = select <8 x i1> %f, <8 x float> %x, <8 x float> %neg + %res = fcmp ueq <8 x float> %sel, zeroinitializer + ret <8 x i1> %res +} + +define <8 x i1> @fcmp_une_sel_x_negx_vec(<8 x float> %x) { +; CHECK-LABEL: @fcmp_une_sel_x_negx_vec( +; CHECK-NEXT: [[RES:%.*]] = fcmp une <8 x float> [[X:%.*]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[RES]] +; + %f = fcmp une <8 x float> %x, zeroinitializer + %neg = fneg <8 x float> %x + %sel = select <8 x i1> %f, <8 x float> %x, <8 x float> %neg + %res = fcmp une <8 x float> %sel, zeroinitializer + ret <8 x i1> %res +} + +define <8 x i1> @fcmp_oeq_sel_x_negx_vec(<8 x float> %x) { +; CHECK-LABEL: @fcmp_oeq_sel_x_negx_vec( +; CHECK-NEXT: [[RES:%.*]] = fcmp oeq <8 x float> [[X:%.*]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[RES]] +; + %f = fcmp oeq <8 x float> %x, zeroinitializer + %neg = fneg <8 x float> %x + %sel = select <8 x i1> %f, <8 x float> %x, <8 x float> %neg + %res = fcmp oeq <8 x float> %sel, zeroinitializer + ret <8 x i1> %res +} + +define <8 x i1> @fcmp_one_sel_x_negx_vec(<8 x float> %x) { +; CHECK-LABEL: @fcmp_one_sel_x_negx_vec( +; CHECK-NEXT: [[RES:%.*]] = fcmp one <8 x float> [[X:%.*]], zeroinitializer +; CHECK-NEXT: ret <8 x i1> [[RES]] +; + %f = fcmp one <8 x float> %x, zeroinitializer + %neg = fneg <8 x float> %x + %sel = select <8 x i1> %f, <8 x float> %x, <8 x float> %neg + %res = fcmp one <8 x float> %sel, zeroinitializer + ret <8 x i1> %res +} + +define <2 x i1> @fcmp_oeq_sel_x_negx_with_any_fpzero_ninf_vec(<2 x i1> %cond, <2 x float> %x) { +; CHECK-LABEL: @fcmp_oeq_sel_x_negx_with_any_fpzero_ninf_vec( +; CHECK-NEXT: [[ICMP:%.*]] = fcmp ninf oeq <2 x float> [[X:%.*]], zeroinitializer +; CHECK-NEXT: ret <2 x i1> [[ICMP]] +; + %fneg = fneg <2 x float> %x + %sel = select <2 x i1> %cond, <2 x 
float> %x, <2 x float> %fneg
+  %icmp = fcmp ninf oeq <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_one_sel_x_negx_with_any_fpzero_ninf_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_one_sel_x_negx_with_any_fpzero_ninf_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp ninf one <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp ninf one <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_ueq_sel_x_negx_with_any_fpzero_ninf_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_ueq_sel_x_negx_with_any_fpzero_ninf_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp ninf ueq <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp ninf ueq <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_une_sel_x_negx_with_any_fpzero_ninf_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_une_sel_x_negx_with_any_fpzero_ninf_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp ninf une <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp ninf une <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_oeq_sel_x_negx_with_any_fpzero_nnan_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_oeq_sel_x_negx_with_any_fpzero_nnan_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp nnan oeq <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp nnan oeq <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_one_sel_x_negx_with_any_fpzero_nnan_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_one_sel_x_negx_with_any_fpzero_nnan_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp nnan one <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp nnan one <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_ueq_sel_x_negx_with_any_fpzero_nnan_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_ueq_sel_x_negx_with_any_fpzero_nnan_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp nnan ueq <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp nnan ueq <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
+
+define <2 x i1> @fcmp_une_sel_x_negx_with_any_fpzero_nnan_vec(<2 x i1> %cond, <2 x float> %x) {
+; CHECK-LABEL: @fcmp_une_sel_x_negx_with_any_fpzero_nnan_vec(
+; CHECK-NEXT: [[ICMP:%.*]] = fcmp nnan une <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[ICMP]]
+;
+  %fneg = fneg <2 x float> %x
+  %sel = select <2 x i1> %cond, <2 x float> %x, <2 x float> %fneg
+  %icmp = fcmp nnan une <2 x float> %sel, <float 0.000000e+00, float -0.000000e+00>
+  ret <2 x i1> %icmp
+}
diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll index 6c0575e8b71971..d76f92c1849af9 100644 --- a/llvm/test/Transforms/InstCombine/powi.ll +++ b/llvm/test/Transforms/InstCombine/powi.ll @@ -401,6 +401,121 @@ define double @fdiv_pow_powi_negative_variable(double %x, i32 %y) { ret
double %div } +; powi(X,C1)/ (X * Z) --> powi(X,C1 - 1)/ Z +define double @fdiv_fmul_powi(double %a, double %z) { +; CHECK-LABEL: @fdiv_fmul_powi( +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan double @llvm.powi.f64.i32(double [[A:%.*]], i32 4) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc nnan double [[TMP1]], [[Z:%.*]] +; CHECK-NEXT: ret double [[DIV]] +; + %pow = call reassoc double @llvm.powi.f64.i32(double %a, i32 5) + %square = fmul reassoc double %z, %a + %div = fdiv reassoc nnan double %pow, %square + ret double %div +} + +; powi(X, 5)/ (X * X) --> powi(X, 4)/ X -> powi(X, 3) +define double @fdiv_fmul_powi_2(double %a) { +; CHECK-LABEL: @fdiv_fmul_powi_2( +; CHECK-NEXT: [[DIV:%.*]] = call reassoc nnan double @llvm.powi.f64.i32(double [[A:%.*]], i32 3) +; CHECK-NEXT: ret double [[DIV]] +; + %pow = call reassoc double @llvm.powi.f64.i32(double %a, i32 5) + %square = fmul reassoc double %a, %a + %div = fdiv reassoc nnan double %pow, %square + ret double %div +} + +define <2 x float> @fdiv_fmul_powi_vector(<2 x float> %a) { +; CHECK-LABEL: @fdiv_fmul_powi_vector( +; CHECK-NEXT: [[DIV:%.*]] = call reassoc nnan <2 x float> @llvm.powi.v2f32.i32(<2 x float> [[A:%.*]], i32 3) +; CHECK-NEXT: ret <2 x float> [[DIV]] +; + %pow = call reassoc <2 x float> @llvm.powi.v2f32.i32(<2 x float> %a, i32 5) + %square = fmul reassoc <2 x float> %a, %a + %div = fdiv reassoc nnan <2 x float> %pow, %square + ret <2 x float> %div +} + +; Negative test +define double @fdiv_fmul_powi_missing_reassoc1(double %a) { +; CHECK-LABEL: @fdiv_fmul_powi_missing_reassoc1( +; CHECK-NEXT: [[POW:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[A:%.*]], i32 5) +; CHECK-NEXT: [[SQUARE:%.*]] = fmul reassoc double [[A]], [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fdiv nnan double [[POW]], [[SQUARE]] +; CHECK-NEXT: ret double [[DIV]] +; + %pow = call reassoc double @llvm.powi.f64.i32(double %a, i32 5) + %square = fmul reassoc double %a, %a + %div = fdiv nnan double %pow, %square + ret double %div +} + +define double @fdiv_fmul_powi_missing_reassoc2(double %a) { +; CHECK-LABEL: @fdiv_fmul_powi_missing_reassoc2( +; CHECK-NEXT: [[POW:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[A:%.*]], i32 5) +; CHECK-NEXT: [[SQUARE:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc nnan double [[POW]], [[SQUARE]] +; CHECK-NEXT: ret double [[DIV]] +; + %pow = call reassoc double @llvm.powi.f64.i32(double %a, i32 5) + %square = fmul double %a, %a + %div = fdiv reassoc nnan double %pow, %square + ret double %div +} + +define double @fdiv_fmul_powi_missing_reassoc3(double %a) { +; CHECK-LABEL: @fdiv_fmul_powi_missing_reassoc3( +; CHECK-NEXT: [[POW:%.*]] = call double @llvm.powi.f64.i32(double [[A:%.*]], i32 5) +; CHECK-NEXT: [[SQUARE:%.*]] = fmul reassoc double [[A]], [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc nnan double [[POW]], [[SQUARE]] +; CHECK-NEXT: ret double [[DIV]] +; + %pow = call double @llvm.powi.f64.i32(double %a, i32 5) + %square = fmul reassoc double %a, %a + %div = fdiv reassoc nnan double %pow, %square + ret double %div +} + +define double @fdiv_fmul_powi_missing_nnan(double %a) { +; CHECK-LABEL: @fdiv_fmul_powi_missing_nnan( +; CHECK-NEXT: [[POW:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[A:%.*]], i32 5) +; CHECK-NEXT: [[SQUARE:%.*]] = fmul reassoc double [[A]], [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc double [[POW]], [[SQUARE]] +; CHECK-NEXT: ret double [[DIV]] +; + %pow = call reassoc double @llvm.powi.f64.i32(double %a, i32 5) + %square = fmul reassoc double %a, %a + %div = 
fdiv reassoc double %pow, %square
+  ret double %div
+}
+
+define double @fdiv_fmul_powi_negative_wrap(double noundef %x) {
+; CHECK-LABEL: @fdiv_fmul_powi_negative_wrap(
+; CHECK-NEXT: [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 -2147483648)
+; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[P1]], [[X]]
+; CHECK-NEXT: ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 -2147483648) ; INT_MIN
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
+
+define double @fdiv_fmul_powi_multi_use(double %a) {
+; CHECK-LABEL: @fdiv_fmul_powi_multi_use(
+; CHECK-NEXT: [[POW:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[A:%.*]], i32 5)
+; CHECK-NEXT: tail call void @use(double [[POW]])
+; CHECK-NEXT: [[SQUARE:%.*]] = fmul reassoc double [[A]], [[A]]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc nnan double [[POW]], [[SQUARE]]
+; CHECK-NEXT: ret double [[DIV]]
+;
+  %pow = call reassoc double @llvm.powi.f64.i32(double %a, i32 5)
+  tail call void @use(double %pow)
+  %square = fmul reassoc double %a, %a
+  %div = fdiv reassoc nnan double %pow, %square
+  ret double %div
+}
+
; powi(X, Y) * X --> powi(X, Y+1)
define double @powi_fmul_powi_x(double noundef %x) {
; CHECK-LABEL: @powi_fmul_powi_x(
diff --git a/llvm/test/Transforms/InstCombine/update-bpi.ll b/llvm/test/Transforms/InstCombine/update-bpi.ll index 190a686c8f078b..fadb2ab16bff6f 100644 --- a/llvm/test/Transforms/InstCombine/update-bpi.ll +++ b/llvm/test/Transforms/InstCombine/update-bpi.ll @@ -6,8 +6,8 @@
; CHECK-NEXT: edge %entry -> %bb2 probability is 0x79e79e7a / 0x80000000 = 95.24% [HOT edge]
; CHECK-NEXT: Printing analysis 'Branch Probability Analysis' for function 'invert_cond':
; CHECK-NEXT: ---- Branch Probabilities ----
-; CHECK-NEXT: edge %entry -> %bb2 probability is 0x06186186 / 0x80000000 = 4.76%
-; CHECK-NEXT: edge %entry -> %bb1 probability is 0x79e79e7a / 0x80000000 = 95.24% [HOT edge]
+; CHECK-NEXT: edge %entry -> %bb2 probability is 0x79e79e7a / 0x80000000 = 95.24% [HOT edge]
+; CHECK-NEXT: edge %entry -> %bb1 probability is 0x06186186 / 0x80000000 = 4.76%
define i32 @invert_cond(ptr %p) {
; CHECK-LABEL: define i32 @invert_cond(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll index 835ff375688173..ae01bdd3711060 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll @@ -26,7 +26,6 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
@@ -36,9 +35,7 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
-; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
@@ -47,17 +44,16 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 72b881bd44c768..8caa9368bfde18 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -27,14 +27,14 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
-; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, ir<true>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
-; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, ir<true>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
-; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll new file mode 100644 index 00000000000000..b49c0ad9976654 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleavevectorization.ll @@ -0,0 +1,920 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; for(int i =0; i < 1024; i+=4) {
+;   x[i] += y[i];
+;   x[i+1] += y[i+1];
+;   x[i+2] += y[i+2];
+;   x[i+3] += y[i+3];
+; }
+define void @add4(ptr noalias noundef %x, ptr noalias noundef %y, i32 noundef %n) {
+; CHECK-LABEL: @add4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 -6
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> [[TMP12]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP13]], label
[[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %x, ptr %x.addr, align 8 + store ptr %y, ptr %y.addr, align 8 + store i32 %n, ptr %n.addr, align 4 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %1 = load ptr, ptr %y.addr, align 8 + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds i16, ptr %1, i64 %idxprom + %3 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %3 to i32 + %4 = load ptr, ptr %x.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i16, ptr %4, i64 %idxprom1 + %6 = load i16, ptr %arrayidx2, align 2 + %conv3 = sext i16 %6 to i32 + %add = add nsw i32 %conv3, %conv + %conv4 = trunc i32 %add to i16 + store i16 %conv4, ptr %arrayidx2, align 2 + %7 = load ptr, ptr %y.addr, align 8 + %8 = load i32, ptr %i, align 4 + %add5 = add nsw i32 %8, 1 + %idxprom6 = sext i32 %add5 to i64 + %arrayidx7 = getelementptr inbounds i16, ptr %7, i64 %idxprom6 + %9 = load i16, ptr %arrayidx7, align 2 + %conv8 = sext i16 %9 to i32 + %10 = load ptr, ptr %x.addr, align 8 + %11 = load i32, ptr %i, align 4 + %add9 = add nsw i32 %11, 1 + %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i16, ptr %10, i64 %idxprom10 + %12 = load i16, ptr %arrayidx11, align 2 + %conv12 = sext i16 %12 to i32 + %add13 = add nsw i32 %conv12, %conv8 + %conv14 = trunc i32 %add13 to i16 + store i16 %conv14, ptr %arrayidx11, align 2 + %13 = load ptr, ptr %y.addr, align 8 + %14 = load i32, ptr %i, align 4 + %add15 = add nsw i32 %14, 2 + %idxprom16 = sext i32 %add15 to i64 + %arrayidx17 = getelementptr inbounds i16, ptr %13, i64 %idxprom16 + %15 = load i16, ptr %arrayidx17, align 2 + %conv18 = sext i16 %15 to i32 + %16 = load ptr, ptr %x.addr, align 8 + %17 = load i32, ptr %i, align 4 + %add19 = add nsw i32 %17, 2 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds i16, ptr %16, i64 %idxprom20 + %18 = load i16, ptr %arrayidx21, align 2 + %conv22 = sext i16 %18 to i32 + %add23 = add nsw i32 %conv22, %conv18 + %conv24 = trunc i32 %add23 to i16 + store i16 %conv24, ptr %arrayidx21, align 2 + %19 = load ptr, ptr %y.addr, align 8 + %20 = load i32, ptr %i, align 4 + %add25 = add nsw i32 %20, 3 + %idxprom26 = sext i32 %add25 to i64 + %arrayidx27 = getelementptr inbounds i16, ptr %19, i64 %idxprom26 + %21 = load i16, ptr %arrayidx27, align 2 + %conv28 = sext i16 %21 to i32 + %22 = load ptr, ptr %x.addr, align 8 + %23 = load i32, ptr %i, align 4 + %add29 = add nsw i32 %23, 3 + %idxprom30 = sext i32 %add29 to i64 + %arrayidx31 = getelementptr inbounds i16, ptr %22, i64 %idxprom30 + %24 = load i16, ptr %arrayidx31, align 2 + %conv32 = sext i16 %24 to i32 + %add33 = add nsw i32 %conv32, %conv28 + %conv34 = trunc i32 %add33 to i16 + store i16 %conv34, ptr %arrayidx31, align 2 + br label %for.inc + +for.inc: + %25 = load i32, ptr %i, align 4 + %add35 = add nsw i32 %25, 4 + store i32 %add35, ptr %i, align 4 + br label %for.cond + +for.end: + ret void +} + +; for(int i =0; i < 1024; i+=4) { +; x[i] += y[i]; +; x[i+1] -= y[i+1]; +; x[i+2] += y[i+2]; +; x[i+3] -= y[i+3]; +; } +define void @addsubs(ptr noalias noundef %x, ptr noundef %y, i32 
noundef %n) { +; CHECK-LABEL: @addsubs( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 -6 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> [[TMP12]], <32 x i32> +; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %x, ptr %x.addr, align 8 + store ptr %y, ptr %y.addr, align 8 + store i32 %n, ptr %n.addr, align 4 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %1 = load ptr, ptr %y.addr, align 8 + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds i16, ptr %1, i64 %idxprom + %3 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %3 to i32 + %4 = load ptr, ptr %x.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i16, ptr %4, i64 %idxprom1 + %6 = load i16, ptr %arrayidx2, align 2 + %conv3 = sext i16 %6 to i32 + %add = add nsw i32 %conv3, %conv + %conv4 = trunc i32 %add to i16 + store i16 %conv4, ptr %arrayidx2, align 2 + %7 = load ptr, ptr %y.addr, align 8 + %8 = load i32, ptr %i, align 4 + %add5 = add nsw i32 %8, 1 + %idxprom6 = sext i32 %add5 to i64 + %arrayidx7 = getelementptr inbounds i16, ptr %7, i64 %idxprom6 + %9 = load i16, ptr %arrayidx7, align 2 + %conv8 = sext i16 %9 to i32 + %10 = load ptr, ptr %x.addr, align 8 + %11 = load i32, ptr %i, align 4 + %add9 = add nsw i32 %11, 1 
+ %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i16, ptr %10, i64 %idxprom10 + %12 = load i16, ptr %arrayidx11, align 2 + %conv12 = sext i16 %12 to i32 + %sub = sub nsw i32 %conv12, %conv8 + %conv13 = trunc i32 %sub to i16 + store i16 %conv13, ptr %arrayidx11, align 2 + %13 = load ptr, ptr %y.addr, align 8 + %14 = load i32, ptr %i, align 4 + %add14 = add nsw i32 %14, 2 + %idxprom15 = sext i32 %add14 to i64 + %arrayidx16 = getelementptr inbounds i16, ptr %13, i64 %idxprom15 + %15 = load i16, ptr %arrayidx16, align 2 + %conv17 = sext i16 %15 to i32 + %16 = load ptr, ptr %x.addr, align 8 + %17 = load i32, ptr %i, align 4 + %add18 = add nsw i32 %17, 2 + %idxprom19 = sext i32 %add18 to i64 + %arrayidx20 = getelementptr inbounds i16, ptr %16, i64 %idxprom19 + %18 = load i16, ptr %arrayidx20, align 2 + %conv21 = sext i16 %18 to i32 + %add22 = add nsw i32 %conv21, %conv17 + %conv23 = trunc i32 %add22 to i16 + store i16 %conv23, ptr %arrayidx20, align 2 + %19 = load ptr, ptr %y.addr, align 8 + %20 = load i32, ptr %i, align 4 + %add24 = add nsw i32 %20, 3 + %idxprom25 = sext i32 %add24 to i64 + %arrayidx26 = getelementptr inbounds i16, ptr %19, i64 %idxprom25 + %21 = load i16, ptr %arrayidx26, align 2 + %conv27 = sext i16 %21 to i32 + %22 = load ptr, ptr %x.addr, align 8 + %23 = load i32, ptr %i, align 4 + %add28 = add nsw i32 %23, 3 + %idxprom29 = sext i32 %add28 to i64 + %arrayidx30 = getelementptr inbounds i16, ptr %22, i64 %idxprom29 + %24 = load i16, ptr %arrayidx30, align 2 + %conv31 = sext i16 %24 to i32 + %sub32 = sub nsw i32 %conv31, %conv27 + %conv33 = trunc i32 %sub32 to i16 + store i16 %conv33, ptr %arrayidx30, align 2 + br label %for.inc + +for.inc: + %25 = load i32, ptr %i, align 4 + %add34 = add nsw i32 %25, 4 + store i32 %add34, ptr %i, align 4 + br label %for.cond + +for.end: + ret void +} + +; for(int i =0; i < 1024; i+=4) { +; x[i] += y[i]; +; x[i+1] += y[i+1]; +; x[i+2] -= y[i+2]; +; x[i+3] -= y[i+3]; +; } +define void @add2sub2(ptr noalias noundef %x, ptr noundef %y, i32 noundef %n) { +; CHECK-LABEL: @add2sub2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 -6 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = sub <32 x i16> [[WIDE_VEC24]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 
[[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> [[TMP12]], <32 x i32> +; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %x, ptr %x.addr, align 8 + store ptr %y, ptr %y.addr, align 8 + store i32 %n, ptr %n.addr, align 4 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %1 = load ptr, ptr %y.addr, align 8 + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds i16, ptr %1, i64 %idxprom + %3 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %3 to i32 + %4 = load ptr, ptr %x.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i16, ptr %4, i64 %idxprom1 + %6 = load i16, ptr %arrayidx2, align 2 + %conv3 = sext i16 %6 to i32 + %add = add nsw i32 %conv3, %conv + %conv4 = trunc i32 %add to i16 + store i16 %conv4, ptr %arrayidx2, align 2 + %7 = load ptr, ptr %y.addr, align 8 + %8 = load i32, ptr %i, align 4 + %add5 = add nsw i32 %8, 1 + %idxprom6 = sext i32 %add5 to i64 + %arrayidx7 = getelementptr inbounds i16, ptr %7, i64 %idxprom6 + %9 = load i16, ptr %arrayidx7, align 2 + %conv8 = sext i16 %9 to i32 + %10 = load ptr, ptr %x.addr, align 8 + %11 = load i32, ptr %i, align 4 + %add9 = add nsw i32 %11, 1 + %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i16, ptr %10, i64 %idxprom10 + %12 = load i16, ptr %arrayidx11, align 2 + %conv12 = sext i16 %12 to i32 + %add13 = add nsw i32 %conv12, %conv8 + %conv14 = trunc i32 %add13 to i16 + store i16 %conv14, ptr %arrayidx11, align 2 + %13 = load ptr, ptr %y.addr, align 8 + %14 = load i32, ptr %i, align 4 + %add15 = add nsw i32 %14, 2 + %idxprom16 = sext i32 %add15 to i64 + %arrayidx17 = getelementptr inbounds i16, ptr %13, i64 %idxprom16 + %15 = load i16, ptr %arrayidx17, align 2 + %conv18 = sext i16 %15 to i32 + %16 = load ptr, ptr %x.addr, align 8 + %17 = load i32, ptr %i, align 4 + %add19 = add nsw i32 %17, 2 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds i16, ptr %16, i64 %idxprom20 + %18 = load i16, ptr %arrayidx21, align 2 + %conv22 = sext i16 %18 to i32 + %sub = sub nsw i32 %conv22, %conv18 + %conv23 = trunc i32 %sub to i16 + store i16 %conv23, ptr %arrayidx21, align 2 + %19 = load ptr, ptr %y.addr, align 8 + %20 = load i32, ptr %i, align 4 + %add24 = add nsw i32 %20, 3 + %idxprom25 = sext i32 %add24 to i64 + %arrayidx26 = getelementptr inbounds i16, ptr %19, i64 %idxprom25 + %21 = load i16, ptr %arrayidx26, align 2 + %conv27 = sext i16 %21 to i32 + %22 = load ptr, ptr %x.addr, align 8 + %23 = load i32, ptr %i, align 4 + %add28 = add nsw i32 %23, 3 + %idxprom29 = sext i32 %add28 to i64 + %arrayidx30 = getelementptr inbounds i16, ptr %22, i64 %idxprom29 + %24 = load i16, ptr 
%arrayidx30, align 2 + %conv31 = sext i16 %24 to i32 + %sub32 = sub nsw i32 %conv31, %conv27 + %conv33 = trunc i32 %sub32 to i16 + store i16 %conv33, ptr %arrayidx30, align 2 + br label %for.inc + +for.inc: + %25 = load i32, ptr %i, align 4 + %add34 = add nsw i32 %25, 4 + store i32 %add34, ptr %i, align 4 + br label %for.cond + +for.end: + ret void +} + +; for(int i =0; i < 1024; i+=4) { +; x[i] += y[i] * z[i]; +; x[i+1] += y[i+1] * z[i+1]; +; x[i+2] += y[i+2] * z[i+2]; +; x[i+3] += y[i+3] * z[i+3]; +; } +define void @addmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32 noundef %n) { +; CHECK-LABEL: @addmul( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 -6 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = add <32 x i16> [[TMP6]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP7]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = add <32 x i16> [[TMP9]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP14:%.*]] = add <32 x i16> [[TMP13]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i16> [[TMP14]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> [[TMP17]], <32 x i32> +; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %z.addr = alloca ptr, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %x, ptr %x.addr, align 8 + store ptr %y, ptr %y.addr, align 8 + store ptr %z, ptr %z.addr, align 8 + store i32 %n, ptr %n.addr, align 4 + store i32 0, ptr %i, align 4 
+ br label %for.cond + +for.cond: + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %1 = load ptr, ptr %y.addr, align 8 + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds i16, ptr %1, i64 %idxprom + %3 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %3 to i32 + %4 = load ptr, ptr %z.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i16, ptr %4, i64 %idxprom1 + %6 = load i16, ptr %arrayidx2, align 2 + %conv3 = sext i16 %6 to i32 + %mul = mul nsw i32 %conv, %conv3 + %7 = load ptr, ptr %x.addr, align 8 + %8 = load i32, ptr %i, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds i16, ptr %7, i64 %idxprom4 + %9 = load i16, ptr %arrayidx5, align 2 + %conv6 = sext i16 %9 to i32 + %add = add nsw i32 %conv6, %mul + %conv7 = trunc i32 %add to i16 + store i16 %conv7, ptr %arrayidx5, align 2 + %10 = load ptr, ptr %y.addr, align 8 + %11 = load i32, ptr %i, align 4 + %add8 = add nsw i32 %11, 1 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds i16, ptr %10, i64 %idxprom9 + %12 = load i16, ptr %arrayidx10, align 2 + %conv11 = sext i16 %12 to i32 + %13 = load ptr, ptr %z.addr, align 8 + %14 = load i32, ptr %i, align 4 + %add12 = add nsw i32 %14, 1 + %idxprom13 = sext i32 %add12 to i64 + %arrayidx14 = getelementptr inbounds i16, ptr %13, i64 %idxprom13 + %15 = load i16, ptr %arrayidx14, align 2 + %conv15 = sext i16 %15 to i32 + %mul16 = mul nsw i32 %conv11, %conv15 + %16 = load ptr, ptr %x.addr, align 8 + %17 = load i32, ptr %i, align 4 + %add17 = add nsw i32 %17, 1 + %idxprom18 = sext i32 %add17 to i64 + %arrayidx19 = getelementptr inbounds i16, ptr %16, i64 %idxprom18 + %18 = load i16, ptr %arrayidx19, align 2 + %conv20 = sext i16 %18 to i32 + %add21 = add nsw i32 %conv20, %mul16 + %conv22 = trunc i32 %add21 to i16 + store i16 %conv22, ptr %arrayidx19, align 2 + %19 = load ptr, ptr %y.addr, align 8 + %20 = load i32, ptr %i, align 4 + %add23 = add nsw i32 %20, 2 + %idxprom24 = sext i32 %add23 to i64 + %arrayidx25 = getelementptr inbounds i16, ptr %19, i64 %idxprom24 + %21 = load i16, ptr %arrayidx25, align 2 + %conv26 = sext i16 %21 to i32 + %22 = load ptr, ptr %z.addr, align 8 + %23 = load i32, ptr %i, align 4 + %add27 = add nsw i32 %23, 2 + %idxprom28 = sext i32 %add27 to i64 + %arrayidx29 = getelementptr inbounds i16, ptr %22, i64 %idxprom28 + %24 = load i16, ptr %arrayidx29, align 2 + %conv30 = sext i16 %24 to i32 + %mul31 = mul nsw i32 %conv26, %conv30 + %25 = load ptr, ptr %x.addr, align 8 + %26 = load i32, ptr %i, align 4 + %add32 = add nsw i32 %26, 2 + %idxprom33 = sext i32 %add32 to i64 + %arrayidx34 = getelementptr inbounds i16, ptr %25, i64 %idxprom33 + %27 = load i16, ptr %arrayidx34, align 2 + %conv35 = sext i16 %27 to i32 + %add36 = add nsw i32 %conv35, %mul31 + %conv37 = trunc i32 %add36 to i16 + store i16 %conv37, ptr %arrayidx34, align 2 + %28 = load ptr, ptr %y.addr, align 8 + %29 = load i32, ptr %i, align 4 + %add38 = add nsw i32 %29, 3 + %idxprom39 = sext i32 %add38 to i64 + %arrayidx40 = getelementptr inbounds i16, ptr %28, i64 %idxprom39 + %30 = load i16, ptr %arrayidx40, align 2 + %conv41 = sext i16 %30 to i32 + %31 = load ptr, ptr %z.addr, align 8 + %32 = load i32, ptr %i, align 4 + %add42 = add nsw i32 %32, 3 + %idxprom43 = sext i32 %add42 to i64 + %arrayidx44 = getelementptr inbounds i16, ptr %31, i64 %idxprom43 + %33 = load i16, ptr 
%arrayidx44, align 2 + %conv45 = sext i16 %33 to i32 + %mul46 = mul nsw i32 %conv41, %conv45 + %34 = load ptr, ptr %x.addr, align 8 + %35 = load i32, ptr %i, align 4 + %add47 = add nsw i32 %35, 3 + %idxprom48 = sext i32 %add47 to i64 + %arrayidx49 = getelementptr inbounds i16, ptr %34, i64 %idxprom48 + %36 = load i16, ptr %arrayidx49, align 2 + %conv50 = sext i16 %36 to i32 + %add51 = add nsw i32 %conv50, %mul46 + %conv52 = trunc i32 %add51 to i16 + store i16 %conv52, ptr %arrayidx49, align 2 + br label %for.inc + +for.inc: + %37 = load i32, ptr %i, align 4 + %add53 = add nsw i32 %37, 4 + store i32 %add53, ptr %i, align 4 + br label %for.cond + +for.end: + ret void +} + +; for(int i =0; i < 1024; i+=4) { +; x[i] += y[i] * z[i]; +; x[i+1] -= y[i+1] * z[i+1]; +; x[i+2] += y[i+2] * z[i+2]; +; x[i+3] -= y[i+3] * z[i+3]; +; } +define void @addsubsmul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32 noundef %n) { +; CHECK-LABEL: @addsubsmul( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 -6 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP7]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = add <32 x i16> [[TMP9]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP14:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i16> [[TMP14]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> [[TMP17]], <32 x i32> +; CHECK-NEXT: store <32 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + 
%x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %z.addr = alloca ptr, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %x, ptr %x.addr, align 8 + store ptr %y, ptr %y.addr, align 8 + store ptr %z, ptr %z.addr, align 8 + store i32 %n, ptr %n.addr, align 4 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %1 = load ptr, ptr %y.addr, align 8 + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds i16, ptr %1, i64 %idxprom + %3 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %3 to i32 + %4 = load ptr, ptr %z.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i16, ptr %4, i64 %idxprom1 + %6 = load i16, ptr %arrayidx2, align 2 + %conv3 = sext i16 %6 to i32 + %mul = mul nsw i32 %conv, %conv3 + %7 = load ptr, ptr %x.addr, align 8 + %8 = load i32, ptr %i, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds i16, ptr %7, i64 %idxprom4 + %9 = load i16, ptr %arrayidx5, align 2 + %conv6 = sext i16 %9 to i32 + %add = add nsw i32 %conv6, %mul + %conv7 = trunc i32 %add to i16 + store i16 %conv7, ptr %arrayidx5, align 2 + %10 = load ptr, ptr %y.addr, align 8 + %11 = load i32, ptr %i, align 4 + %add8 = add nsw i32 %11, 1 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds i16, ptr %10, i64 %idxprom9 + %12 = load i16, ptr %arrayidx10, align 2 + %conv11 = sext i16 %12 to i32 + %13 = load ptr, ptr %z.addr, align 8 + %14 = load i32, ptr %i, align 4 + %add12 = add nsw i32 %14, 1 + %idxprom13 = sext i32 %add12 to i64 + %arrayidx14 = getelementptr inbounds i16, ptr %13, i64 %idxprom13 + %15 = load i16, ptr %arrayidx14, align 2 + %conv15 = sext i16 %15 to i32 + %mul16 = mul nsw i32 %conv11, %conv15 + %16 = load ptr, ptr %x.addr, align 8 + %17 = load i32, ptr %i, align 4 + %add17 = add nsw i32 %17, 1 + %idxprom18 = sext i32 %add17 to i64 + %arrayidx19 = getelementptr inbounds i16, ptr %16, i64 %idxprom18 + %18 = load i16, ptr %arrayidx19, align 2 + %conv20 = sext i16 %18 to i32 + %sub = sub nsw i32 %conv20, %mul16 + %conv21 = trunc i32 %sub to i16 + store i16 %conv21, ptr %arrayidx19, align 2 + %19 = load ptr, ptr %y.addr, align 8 + %20 = load i32, ptr %i, align 4 + %add22 = add nsw i32 %20, 2 + %idxprom23 = sext i32 %add22 to i64 + %arrayidx24 = getelementptr inbounds i16, ptr %19, i64 %idxprom23 + %21 = load i16, ptr %arrayidx24, align 2 + %conv25 = sext i16 %21 to i32 + %22 = load ptr, ptr %z.addr, align 8 + %23 = load i32, ptr %i, align 4 + %add26 = add nsw i32 %23, 2 + %idxprom27 = sext i32 %add26 to i64 + %arrayidx28 = getelementptr inbounds i16, ptr %22, i64 %idxprom27 + %24 = load i16, ptr %arrayidx28, align 2 + %conv29 = sext i16 %24 to i32 + %mul30 = mul nsw i32 %conv25, %conv29 + %25 = load ptr, ptr %x.addr, align 8 + %26 = load i32, ptr %i, align 4 + %add31 = add nsw i32 %26, 2 + %idxprom32 = sext i32 %add31 to i64 + %arrayidx33 = getelementptr inbounds i16, ptr %25, i64 %idxprom32 + %27 = load i16, ptr %arrayidx33, align 2 + %conv34 = sext i16 %27 to i32 + %add35 = add nsw i32 %conv34, %mul30 + %conv36 = trunc i32 %add35 to i16 + store i16 %conv36, ptr %arrayidx33, align 2 + %28 = load ptr, ptr %y.addr, align 8 + %29 = load i32, ptr %i, align 4 + %add37 = add nsw i32 %29, 3 + %idxprom38 = sext i32 %add37 to i64 + %arrayidx39 = getelementptr inbounds i16, 
ptr %28, i64 %idxprom38 + %30 = load i16, ptr %arrayidx39, align 2 + %conv40 = sext i16 %30 to i32 + %31 = load ptr, ptr %z.addr, align 8 + %32 = load i32, ptr %i, align 4 + %add41 = add nsw i32 %32, 3 + %idxprom42 = sext i32 %add41 to i64 + %arrayidx43 = getelementptr inbounds i16, ptr %31, i64 %idxprom42 + %33 = load i16, ptr %arrayidx43, align 2 + %conv44 = sext i16 %33 to i32 + %mul45 = mul nsw i32 %conv40, %conv44 + %34 = load ptr, ptr %x.addr, align 8 + %35 = load i32, ptr %i, align 4 + %add46 = add nsw i32 %35, 3 + %idxprom47 = sext i32 %add46 to i64 + %arrayidx48 = getelementptr inbounds i16, ptr %34, i64 %idxprom47 + %36 = load i16, ptr %arrayidx48, align 2 + %conv49 = sext i16 %36 to i32 + %sub50 = sub nsw i32 %conv49, %mul45 + %conv51 = trunc i32 %sub50 to i16 + store i16 %conv51, ptr %arrayidx48, align 2 + br label %for.inc + +for.inc: + %37 = load i32, ptr %i, align 4 + %add52 = add nsw i32 %37, 4 + store i32 %add52, ptr %i, align 4 + br label %for.cond + +for.end: + ret void +} + +; for(int i =0; i < 1024; i+=4) { +; x[i] += y[i] * z[i]; +; x[i+1] += y[i+1] * z[i+1]; +; x[i+2] -= y[i+2] * z[i+2]; +; x[i+3] -= y[i+3] * z[i+3]; +; } +define void @add2sub2mul(ptr noalias noundef %x, ptr noundef %y, ptr noundef %z, i32 noundef %n) { +; CHECK-LABEL: @add2sub2mul( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 -6 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i16>, ptr [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Z:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC31:%.*]] = load <32 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC36:%.*]] = load <32 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i16> [[TMP2]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = add <32 x i16> [[TMP6]], [[WIDE_VEC36]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i16> [[TMP7]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <32 x i16> [[TMP10]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = mul <32 x i16> [[WIDE_VEC31]], [[WIDE_VEC]] +; CHECK-NEXT: [[TMP14:%.*]] = sub <32 x i16> [[WIDE_VEC36]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i16> [[TMP14]], <32 x i16> poison, <8 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[INVARIANT_GEP]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> [[TMP17]], <32 x i32> +; CHECK-NEXT: store <32 x 
i16> [[INTERLEAVED_VEC]], ptr [[GEP]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %z.addr = alloca ptr, align 8 + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store ptr %x, ptr %x.addr, align 8 + store ptr %y, ptr %y.addr, align 8 + store ptr %z, ptr %z.addr, align 8 + store i32 %n, ptr %n.addr, align 4 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: + %0 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %0, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %1 = load ptr, ptr %y.addr, align 8 + %2 = load i32, ptr %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds i16, ptr %1, i64 %idxprom + %3 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %3 to i32 + %4 = load ptr, ptr %z.addr, align 8 + %5 = load i32, ptr %i, align 4 + %idxprom1 = sext i32 %5 to i64 + %arrayidx2 = getelementptr inbounds i16, ptr %4, i64 %idxprom1 + %6 = load i16, ptr %arrayidx2, align 2 + %conv3 = sext i16 %6 to i32 + %mul = mul nsw i32 %conv, %conv3 + %7 = load ptr, ptr %x.addr, align 8 + %8 = load i32, ptr %i, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds i16, ptr %7, i64 %idxprom4 + %9 = load i16, ptr %arrayidx5, align 2 + %conv6 = sext i16 %9 to i32 + %add = add nsw i32 %conv6, %mul + %conv7 = trunc i32 %add to i16 + store i16 %conv7, ptr %arrayidx5, align 2 + %10 = load ptr, ptr %y.addr, align 8 + %11 = load i32, ptr %i, align 4 + %add8 = add nsw i32 %11, 1 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds i16, ptr %10, i64 %idxprom9 + %12 = load i16, ptr %arrayidx10, align 2 + %conv11 = sext i16 %12 to i32 + %13 = load ptr, ptr %z.addr, align 8 + %14 = load i32, ptr %i, align 4 + %add12 = add nsw i32 %14, 1 + %idxprom13 = sext i32 %add12 to i64 + %arrayidx14 = getelementptr inbounds i16, ptr %13, i64 %idxprom13 + %15 = load i16, ptr %arrayidx14, align 2 + %conv15 = sext i16 %15 to i32 + %mul16 = mul nsw i32 %conv11, %conv15 + %16 = load ptr, ptr %x.addr, align 8 + %17 = load i32, ptr %i, align 4 + %add17 = add nsw i32 %17, 1 + %idxprom18 = sext i32 %add17 to i64 + %arrayidx19 = getelementptr inbounds i16, ptr %16, i64 %idxprom18 + %18 = load i16, ptr %arrayidx19, align 2 + %conv20 = sext i16 %18 to i32 + %add21 = add nsw i32 %conv20, %mul16 + %conv22 = trunc i32 %add21 to i16 + store i16 %conv22, ptr %arrayidx19, align 2 + %19 = load ptr, ptr %y.addr, align 8 + %20 = load i32, ptr %i, align 4 + %add23 = add nsw i32 %20, 2 + %idxprom24 = sext i32 %add23 to i64 + %arrayidx25 = getelementptr inbounds i16, ptr %19, i64 %idxprom24 + %21 = load i16, ptr %arrayidx25, align 2 + %conv26 = sext i16 %21 to i32 + %22 = load ptr, ptr %z.addr, align 8 + %23 = load i32, ptr %i, align 4 + %add27 = add nsw i32 %23, 2 + %idxprom28 = sext i32 %add27 to i64 + %arrayidx29 = getelementptr inbounds i16, ptr %22, i64 %idxprom28 + %24 = load i16, ptr %arrayidx29, align 2 + %conv30 = sext i16 %24 to i32 + %mul31 = mul nsw i32 %conv26, %conv30 + %25 = load ptr, ptr %x.addr, align 8 + %26 = load i32, ptr %i, align 4 + %add32 = add nsw i32 %26, 2 + %idxprom33 = sext i32 %add32 to i64 + %arrayidx34 = getelementptr inbounds i16, ptr %25, i64 %idxprom33 + %27 = load i16, ptr %arrayidx34, align 2 + 
%conv35 = sext i16 %27 to i32 + %sub = sub nsw i32 %conv35, %mul31 + %conv36 = trunc i32 %sub to i16 + store i16 %conv36, ptr %arrayidx34, align 2 + %28 = load ptr, ptr %y.addr, align 8 + %29 = load i32, ptr %i, align 4 + %add37 = add nsw i32 %29, 3 + %idxprom38 = sext i32 %add37 to i64 + %arrayidx39 = getelementptr inbounds i16, ptr %28, i64 %idxprom38 + %30 = load i16, ptr %arrayidx39, align 2 + %conv40 = sext i16 %30 to i32 + %31 = load ptr, ptr %z.addr, align 8 + %32 = load i32, ptr %i, align 4 + %add41 = add nsw i32 %32, 3 + %idxprom42 = sext i32 %add41 to i64 + %arrayidx43 = getelementptr inbounds i16, ptr %31, i64 %idxprom42 + %33 = load i16, ptr %arrayidx43, align 2 + %conv44 = sext i16 %33 to i32 + %mul45 = mul nsw i32 %conv40, %conv44 + %34 = load ptr, ptr %x.addr, align 8 + %35 = load i32, ptr %i, align 4 + %add46 = add nsw i32 %35, 3 + %idxprom47 = sext i32 %add46 to i64 + %arrayidx48 = getelementptr inbounds i16, ptr %34, i64 %idxprom47 + %36 = load i16, ptr %arrayidx48, align 2 + %conv49 = sext i16 %36 to i32 + %sub50 = sub nsw i32 %conv49, %mul45 + %conv51 = trunc i32 %sub50 to i16 + store i16 %conv51, ptr %arrayidx48, align 2 + br label %for.inc + +for.inc: + %37 = load i32, ptr %i, align 4 + %add52 = add nsw i32 %37, 4 + store i32 %add52, ptr %i, align 4 + br label %for.cond + +for.end: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll new file mode 100644 index 00000000000000..b03eb9e67254be --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s + +define i64 @test(ptr %p) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 12 +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i8> +; CHECK-NEXT: store <4 x i8> [[TMP12]], ptr [[TMP1]], align 1 +; CHECK-NEXT: ret i64 0 +; + %1 = getelementptr i8, ptr %p, i64 13 + %2 = getelementptr i8, ptr %p, i64 14 + %3 = getelementptr i8, ptr %p, i64 15 + %4 = getelementptr i8, ptr %p, i64 12 + %5 = zext i8 0 to i32 + %6 = and i32 %5, 0 + %.not866 = icmp eq i32 %6, 0 + %7 = select i1 %.not866, i32 0, i32 0 + %8 = xor i32 0, %7 + %9 = zext i8 0 to i32 + %10 = and i32 %9, 0 + %.not871 = icmp eq i32 %10, 0 + %11 = select i1 %.not871, i32 0, i32 0 + %12 = xor i32 0, %11 + %13 = xor i32 %9, 0 + %14 = xor i32 %13, 0 + %15 = xor i32 %14, 0 + %16 = xor i32 %15, 0 + %17 = xor i32 %16, 0 + %18 = xor i32 %17, %12 + %19 = xor i32 %18, 0 + %20 = xor i32 %19, 0 + %21 = xor i32 %20, 0 + %22 = xor i32 %21, 0 + %23 = trunc 
i32 %22 to i8 + store i8 %23, ptr %4, align 1 + %24 = xor i32 %9, 0 + %25 = xor i32 %24, 0 + %26 = xor i32 %25, 0 + %27 = xor i32 %26, 0 + %28 = xor i32 %27, 0 + %29 = xor i32 %28, %8 + %30 = xor i32 %29, 0 + %31 = xor i32 %30, 0 + %32 = xor i32 %31, 0 + %33 = xor i32 %32, 0 + %34 = trunc i32 %33 to i8 + store i8 %34, ptr %1, align 1 + %35 = xor i32 0, %5 + %36 = xor i32 %35, 0 + %37 = xor i32 %36, 0 + %38 = xor i32 %37, 0 + %39 = xor i32 %38, 0 + %40 = xor i32 %39, %8 + %41 = xor i32 %40, 0 + %42 = xor i32 %41, 0 + %43 = xor i32 %42, 0 + %44 = xor i32 %43, 0 + %45 = trunc i32 %44 to i8 + store i8 %45, ptr %2, align 1 + %46 = xor i32 %35, 0 + %47 = xor i32 %46, 0 + %48 = xor i32 %47, 0 + %49 = xor i32 %48, 0 + %50 = xor i32 %49, %8 + %51 = xor i32 %50, 0 + %52 = xor i32 %51, 0 + %53 = xor i32 %52, 0 + %54 = xor i32 %53, 0 + %55 = trunc i32 %54 to i8 + store i8 %55, ptr %3, align 1 + ret i64 0 +} diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll index 87a70ccd3fc7c5..f57ec8d589821f 100644 --- a/llvm/test/Transforms/Scalarizer/basic.ll +++ b/llvm/test/Transforms/Scalarizer/basic.ll @@ -283,13 +283,13 @@ end: define <4 x float> @f6(<4 x float> %x) { ; CHECK-LABEL: @f6( ; CHECK-NEXT: [[X_I0:%.*]] = extractelement <4 x float> [[X:%.*]], i64 0 -; CHECK-NEXT: [[RES_I0:%.*]] = fadd float [[X_I0]], 1.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I0:%.*]] = fadd float [[X_I0]], 1.000000e+00, !fpmath [[META9:![0-9]+]] ; CHECK-NEXT: [[X_I1:%.*]] = extractelement <4 x float> [[X]], i64 1 -; CHECK-NEXT: [[RES_I1:%.*]] = fadd float [[X_I1]], 2.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I1:%.*]] = fadd float [[X_I1]], 2.000000e+00, !fpmath [[META9]] ; CHECK-NEXT: [[X_I2:%.*]] = extractelement <4 x float> [[X]], i64 2 -; CHECK-NEXT: [[RES_I2:%.*]] = fadd float [[X_I2]], 3.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I2:%.*]] = fadd float [[X_I2]], 3.000000e+00, !fpmath [[META9]] ; CHECK-NEXT: [[X_I3:%.*]] = extractelement <4 x float> [[X]], i64 3 -; CHECK-NEXT: [[RES_I3:%.*]] = fadd float [[X_I3]], 4.000000e+00, !fpmath !9 +; CHECK-NEXT: [[RES_I3:%.*]] = fadd float [[X_I3]], 4.000000e+00, !fpmath [[META9]] ; CHECK-NEXT: [[RES_UPTO0:%.*]] = insertelement <4 x float> poison, float [[RES_I0]], i64 0 ; CHECK-NEXT: [[RES_UPTO1:%.*]] = insertelement <4 x float> [[RES_UPTO0]], float [[RES_I1]], i64 1 ; CHECK-NEXT: [[RES_UPTO2:%.*]] = insertelement <4 x float> [[RES_UPTO1]], float [[RES_I2]], i64 2 @@ -865,6 +865,20 @@ define <2 x float> @f25(<2 x float> %src) { ret <2 x float> %mul } +define <2 x i8> @test_copy_trunc_flags(<2 x i32> %src) { +; CHECK-LABEL: @test_copy_trunc_flags( +; CHECK-NEXT: [[SRC_I0:%.*]] = extractelement <2 x i32> [[SRC:%.*]], i64 0 +; CHECK-NEXT: [[TRUNC_I0:%.*]] = trunc nuw nsw i32 [[SRC_I0]] to i8 +; CHECK-NEXT: [[SRC_I1:%.*]] = extractelement <2 x i32> [[SRC]], i64 1 +; CHECK-NEXT: [[TRUNC_I1:%.*]] = trunc nuw nsw i32 [[SRC_I1]] to i8 +; CHECK-NEXT: [[TRUNC_UPTO0:%.*]] = insertelement <2 x i8> poison, i8 [[TRUNC_I0]], i64 0 +; CHECK-NEXT: [[TRUNC:%.*]] = insertelement <2 x i8> [[TRUNC_UPTO0]], i8 [[TRUNC_I1]], i64 1 +; CHECK-NEXT: ret <2 x i8> [[TRUNC]] +; + %trunc = trunc nuw nsw <2 x i32> %src to <2 x i8> + ret <2 x i8> %trunc +} + !0 = !{ !"root" } !1 = !{ !"set1", !0 } !2 = !{ !"set2", !0 } diff --git a/llvm/test/tools/llvm-readobj/ELF/relr-relocs.test b/llvm/test/tools/llvm-readobj/ELF/relr-relocs.test index 91b148ebb6e3c8..9b59f991c051e2 100644 --- a/llvm/test/tools/llvm-readobj/ELF/relr-relocs.test +++ 
b/llvm/test/tools/llvm-readobj/ELF/relr-relocs.test @@ -47,29 +47,39 @@ # RAW-GNU1-NEXT: 00000000000f0501 # RAW-GNU1-NEXT: 000a700550400009 -# RUN: llvm-readelf --relocations %t1 | FileCheck --check-prefix=GNU1 %s -# GNU1: Relocation section '.relr.dyn' at offset 0x40 contains 21 entries: -# GNU1: 0000000000010d60 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000010d68 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000010da0 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020000 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020040 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020050 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020080 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020088 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020090 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020098 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020210 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 00000000000202a8 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 00000000000202d8 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 00000000000202e8 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 00000000000202f8 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020308 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020358 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020360 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020368 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020380 0000000000000008 R_X86_64_RELATIVE -# GNU1-NEXT: 0000000000020390 0000000000000008 R_X86_64_RELATIVE +# RUN: llvm-readelf --relocations %t1 | FileCheck --check-prefix=GNU1 --match-full-lines --strict-whitespace %s +# GNU1:Relocation section '.relr.dyn' at offset 0x40 contains 21 entries: +# GNU1-NEXT:Index: Entry Address Symbolic Address +# GNU1-NEXT:0000: 0000000000010d60 0000000000010d60 +# GNU1-NEXT:0001: 0000000000000103 0000000000010d68 +# GNU1-NEXT: 0000000000010da0 base + 0x30 +# GNU1-NEXT:0002: 0000000000020000 0000000000020000 foo +# GNU1-NEXT:0003: 00000000000f0501 0000000000020040 foo + 0x40 +# GNU1-NEXT: 0000000000020050 foo + 0x50 +# GNU1-NEXT: 0000000000020080 foo + 0x80 +# GNU1-NEXT: 0000000000020088 foo + 0x88 +# GNU1-NEXT: 0000000000020090 foo + 0x90 +# GNU1-NEXT: 0000000000020098 foo + 0x98 +# GNU1-NEXT:0004: 000a700550400009 0000000000020210 bar + 0x10 +# GNU1-NEXT: 00000000000202a8 bar + 0xa8 +# GNU1-NEXT: 00000000000202d8 bar + 0xd8 +# GNU1-NEXT: 00000000000202e8 bar + 0xe8 +# GNU1-NEXT: 00000000000202f8 bar + 0xf8 +# GNU1-NEXT: 0000000000020308 bar + 0x108 +# GNU1-NEXT: 0000000000020358 bar + 0x158 +# GNU1-NEXT: 0000000000020360 bar + 0x160 +# GNU1-NEXT: 0000000000020368 bar + 0x168 +# GNU1-NEXT: 0000000000020380 bar + 0x180 +# GNU1-NEXT: 0000000000020390 bar + 0x190 +# GNU1-NOT:{{.}} + +## The addresses are not symbolized in the absence of .symtab. 
+# RUN: llvm-objcopy --strip-all %t1 %t1.stripped +# RUN: llvm-readelf --relocations %t1.stripped | FileCheck --check-prefix=GNU1S --match-full-lines --strict-whitespace %s +# GNU1S:Relocation section '.relr.dyn' at offset 0x40 contains 21 entries: +# GNU1S-NEXT:Index: Entry Address Symbolic Address +# GNU1S-NEXT:0000: 0000000000010d60 0000000000010d60 +# GNU1S-NEXT:0001: 0000000000000103 0000000000010d68 --- !ELF FileHeader: @@ -83,6 +93,18 @@ Sections: Flags: [ SHF_ALLOC ] Entries: [ 0x0000000000010D60, 0x0000000000000103, 0x0000000000020000, 0x00000000000F0501, 0x000A700550400009 ] +Symbols: + - Name: bar + Value: 0x20200 + Section: .relr.dyn + - Name: foo + Value: 0x20000 + Section: .relr.dyn + - Name: base + Value: 0x10d70 + Section: .relr.dyn + - Name: ignored + Value: 0x20210 # RUN: yaml2obj --docnum=2 %s -o %t2 # RUN: llvm-readobj --relocations --raw-relr %t2 | \ @@ -124,22 +146,24 @@ Sections: # RAW-GNU2-NEXT: 000f0501 # RAW-GNU2-NEXT: 50400009 -# RUN: llvm-readelf --relocations %t2 | FileCheck --check-prefix=GNU2 %s -# GNU2: Relocation section '.relr.dyn' at offset 0x34 contains 14 entries: -# GNU2: 00010d60 00000008 R_386_RELATIVE -# GNU2-NEXT: 00010d64 00000008 R_386_RELATIVE -# GNU2-NEXT: 00010d80 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020000 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020020 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020028 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020040 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020044 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020048 00000008 R_386_RELATIVE -# GNU2-NEXT: 0002004c 00000008 R_386_RELATIVE -# GNU2-NEXT: 00020088 00000008 R_386_RELATIVE -# GNU2-NEXT: 000200d4 00000008 R_386_RELATIVE -# GNU2-NEXT: 000200ec 00000008 R_386_RELATIVE -# GNU2-NEXT: 000200f4 00000008 R_386_RELATIVE +# RUN: llvm-readelf --relocations %t2 | FileCheck --check-prefix=GNU2 --match-full-lines --strict-whitespace %s +# GNU2:Relocation section '.relr.dyn' at offset 0x34 contains 14 entries: +# GNU2-NEXT:Index: Entry Address Symbolic Address +# GNU2-NEXT:0000: 00010d60 00010d60 .relr.dyn +# GNU2-NEXT:0001: 00000103 00010d64 .relr.dyn + 0x4 +# GNU2-NEXT: 00010d80 .relr.dyn + 0x20 +# GNU2-NEXT:0002: 00020000 00020000 .relr.dyn + 0xf2a0 +# GNU2-NEXT:0003: 000f0501 00020020 .relr.dyn + 0xf2c0 +# GNU2-NEXT: 00020028 .relr.dyn + 0xf2c8 +# GNU2-NEXT: 00020040 .relr.dyn + 0xf2e0 +# GNU2-NEXT: 00020044 .relr.dyn + 0xf2e4 +# GNU2-NEXT: 00020048 .relr.dyn + 0xf2e8 +# GNU2-NEXT: 0002004c .relr.dyn + 0xf2ec +# GNU2-NEXT:0004: 50400009 00020088 .relr.dyn + 0xf328 +# GNU2-NEXT: 000200d4 .relr.dyn + 0xf374 +# GNU2-NEXT: 000200ec .relr.dyn + 0xf38c +# GNU2-NEXT: 000200f4 .relr.dyn + 0xf394 +# GNU2-NOT:{{.}} --- !ELF FileHeader: @@ -156,6 +180,11 @@ Sections: EntSize: [[ENTSIZE=]] ShType: [[SHTYPE=]] Link: [[LINK=]] +Symbols: + - Name: .relr.dyn + Type: STT_SECTION + Value: 0x10D60 + Section: .relr.dyn ## Check we report a warning when we are unable to dump relocations ## for a SHT_RELR/SHT_ANDROID_RELR/SHT_AARCH64_AUTH_RELR section. @@ -175,7 +204,6 @@ Sections: # BROKEN-GNU: warning: '[[FILE]]': unable to get the number of relocations in [[SECNAME]] section with index 1: section [index 1] has invalid sh_entsize: expected 4, but got 1 # BROKEN-GNU: Relocation section '.relr.dyn' at offset 0x34 contains entries: -# BROKEN-GNU-NEXT: Offset Info Type Sym. 
Value Symbol's Name + # BROKEN-GNU-NEXT: warning: '[[FILE]]': unable to read relocations from [[SECNAME]] section with index 1: section [index 1] has invalid sh_entsize: expected 4, but got 1 ## Case B: check the case when relocations can't be read from an SHT_ANDROID_RELR section. @@ -218,3 +246,36 @@ Sections: # RUN: FileCheck -DFILE=%t2.has.link --check-prefix=RAW-LLVM2 %s # RUN: llvm-readelf --relocations --raw-relr %t2.has.link 2>&1 | \ # RUN: FileCheck -DFILE=%t2.has.link --check-prefix=RAW-GNU2 %s + +## .symtab is invalid. Check we report a warning and print entries without symbolization. +# RUN: yaml2obj --docnum=3 -DENTSIZE=1 %s -o %t3.err1 +# RUN: llvm-readelf -r %t3.err1 2>&1 | FileCheck -DFILE=%t3.err1 --check-prefixes=GNU3,GNU3-ERR1 --match-full-lines %s +# RUN: yaml2obj --docnum=3 -DLINK=0xff %s -o %t3.err2 +# RUN: llvm-readelf -r %t3.err2 2>&1 | FileCheck -DFILE=%t3.err2 --check-prefixes=GNU3,GNU3-ERR2 --match-full-lines %s + +# GNU3:Index: Entry Address Symbolic Address +# GNU3-ERR1-NEXT:{{.*}}warning: '[[FILE]]': unable to read symbols from the SHT_SYMTAB section: section [index 2] has invalid sh_entsize: expected 24, but got 1 +# GNU3-ERR2-NEXT:{{.*}}warning: '[[FILE]]': unable to get the string table for the SHT_SYMTAB section: invalid section index: 255 +# GNU3-NEXT:0000: 0000000000010d60 0000000000010d60 +# GNU3-NEXT:0001: 0000000000000103 0000000000010d68 +# GNU3-NEXT: 0000000000010da0 +# GNU3-NOT:{{.}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .relr.dyn + Type: SHT_RELR + Flags: [ SHF_ALLOC ] + Entries: [ 0x0000000000010D60, 0x0000000000000103 ] + - Name: .symtab + Type: SHT_SYMTAB + Link: [[LINK=.strtab]] + EntSize: [[ENTSIZE=0x18]] +Symbols: + - Name: bar + Value: 0x10D60 diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 1108672003fc76..f145653ac743c5 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -411,6 +411,7 @@ template <typename ELFT> class ELFDumper : public ObjDumper { std::string getStaticSymbolName(uint32_t Index) const; StringRef getDynamicString(uint64_t Value) const; + std::pair<Elf_Sym_Range, std::optional<StringRef>> getSymtabAndStrtab() const; void printSymbolsHelper(bool IsDynamic, bool ExtraSymInfo) const; std::string getDynamicEntry(uint64_t Type, uint64_t Value) const; @@ -512,6 +513,28 @@ ELFDumper<ELFT>::getVersionTable(const Elf_Shdr &Sec, ArrayRef<Elf_Sym> *SymTab, return *VersionsOrErr; } +template <class ELFT> +std::pair<typename ELFDumper<ELFT>::Elf_Sym_Range, std::optional<StringRef>> +ELFDumper<ELFT>::getSymtabAndStrtab() const { + assert(DotSymtabSec); + Elf_Sym_Range Syms(nullptr, nullptr); + std::optional<StringRef> StrTable; + if (Expected<StringRef> StrTableOrErr = + Obj.getStringTableForSymtab(*DotSymtabSec)) + StrTable = *StrTableOrErr; + else + reportUniqueWarning( + "unable to get the string table for the SHT_SYMTAB section: " + + toString(StrTableOrErr.takeError())); + + if (Expected<Elf_Sym_Range> SymsOrErr = Obj.symbols(DotSymtabSec)) + Syms = *SymsOrErr; + else + reportUniqueWarning("unable to read symbols from the SHT_SYMTAB section: " + + toString(SymsOrErr.takeError())); + return {Syms, StrTable}; +} + template <class ELFT> void ELFDumper<ELFT>::printSymbolsHelper(bool IsDynamic, bool ExtraSymInfo) const { @@ -525,20 +548,7 @@ void ELFDumper<ELFT>::printSymbolsHelper(bool IsDynamic, Syms = dynamic_symbols(); Entries = Syms.size(); } else if (DotSymtabSec) { - if (Expected<StringRef> StrTableOrErr = - Obj.getStringTableForSymtab(*DotSymtabSec)) - StrTable = *StrTableOrErr; - else - reportUniqueWarning( - "unable to get the string table for the
SHT_SYMTAB section: " + - toString(StrTableOrErr.takeError())); - - if (Expected<Elf_Sym_Range> SymsOrErr = Obj.symbols(DotSymtabSec)) - Syms = *SymsOrErr; - else - reportUniqueWarning( - "unable to read symbols from the SHT_SYMTAB section: " + - toString(SymsOrErr.takeError())); + std::tie(Syms, StrTable) = getSymtabAndStrtab(); Entries = DotSymtabSec->getEntityCount(); } if (Syms.empty()) @@ -658,6 +668,7 @@ template <typename ELFT> class GNUELFDumper : public ELFDumper<ELFT> { void printHashedSymbol(const Elf_Sym *Sym, unsigned SymIndex, DataRegion<Elf_Word> ShndxTable, StringRef StrTable, uint32_t Bucket); + void printRelr(const Elf_Shdr &Sec); void printRelrReloc(const Elf_Relr &R) override; void printRelRelaReloc(const Relocation<ELFT> &R, const RelSymbol<ELFT> &RelSym) override; @@ -3882,6 +3893,12 @@ static bool isRelocationSec(const typename ELFT::Shdr &Sec, } template <class ELFT> void GNUELFDumper<ELFT>::printRelocations() { + auto PrintAsRelr = [&](const Elf_Shdr &Sec) { + return !opts::RawRelr && (Sec.sh_type == ELF::SHT_RELR || + Sec.sh_type == ELF::SHT_ANDROID_RELR || + (this->Obj.getHeader().e_machine == EM_AARCH64 && + Sec.sh_type == ELF::SHT_AARCH64_AUTH_RELR)); + }; auto GetEntriesNum = [&](const Elf_Shdr &Sec) -> Expected<size_t> { // Android's packed relocation section needs to be unpacked first // to get the actual number of entries. @@ -3894,10 +3911,7 @@ template <class ELFT> void GNUELFDumper<ELFT>::printRelocations() { return RelasOrErr->size(); } - if (!opts::RawRelr && - (Sec.sh_type == ELF::SHT_RELR || Sec.sh_type == ELF::SHT_ANDROID_RELR || - (this->Obj.getHeader().e_machine == EM_AARCH64 && - Sec.sh_type == ELF::SHT_AARCH64_AUTH_RELR))) { + if (PrintAsRelr(Sec)) { Expected<Elf_Relr_Range> RelrsOrErr = this->Obj.relrs(Sec); if (!RelrsOrErr) return RelrsOrErr.takeError(); @@ -3926,13 +3940,89 @@ template <class ELFT> void GNUELFDumper<ELFT>::printRelocations() { OS << "\nRelocation section '" << Name << "' at offset 0x" << utohexstr(Offset, /*LowerCase=*/true) << " contains " << EntriesNum << " entries:\n"; - printRelocHeaderFields(OS, Sec.sh_type, this->Obj.getHeader()); - this->printRelocationsHelper(Sec); + + if (PrintAsRelr(Sec)) { + printRelr(Sec); + } else { + printRelocHeaderFields(OS, Sec.sh_type, this->Obj.getHeader()); + this->printRelocationsHelper(Sec); + } } if (!HasRelocSections) OS << "\nThere are no relocations in this file.\n"; } +template <class ELFT> void GNUELFDumper<ELFT>::printRelr(const Elf_Shdr &Sec) { + Expected<Elf_Relr_Range> RangeOrErr = this->Obj.relrs(Sec); + if (!RangeOrErr) { + this->reportUniqueWarning("unable to read relocations from " + + this->describe(Sec) + ": " + + toString(RangeOrErr.takeError())); + return; + } + if (ELFT::Is64Bits) + OS << "Index: Entry Address Symbolic Address\n"; + else + OS << "Index: Entry Address Symbolic Address\n"; + + // If .symtab is available, collect its defined symbols and sort them by + // st_value. + SmallVector<std::pair<uint64_t, std::string>, 0> Syms; + if (this->DotSymtabSec) { + Elf_Sym_Range Symtab; + std::optional<StringRef> Strtab; + std::tie(Symtab, Strtab) = this->getSymtabAndStrtab(); + if (Symtab.size() && Strtab) { + for (auto [I, Sym] : enumerate(Symtab)) { + if (!Sym.st_shndx) + continue; + Syms.emplace_back(Sym.st_value, + this->getFullSymbolName(Sym, I, ArrayRef<Elf_Word>(), + *Strtab, false)); + } + } + } + llvm::stable_sort(Syms); + + typename ELFT::uint Base = 0; + size_t I = 0; + auto Print = [&](uint64_t Where) { + OS << format_hex_no_prefix(Where, ELFT::Is64Bits ? 16 : 8); + for (; I < Syms.size() && Syms[I].first <= Where; ++I) + ; + // Try symbolizing the address. Find the nearest symbol before or at the + // address and print the symbol and the address difference.
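+ // For example, with the relr-relocs.test input above: after the even entry + // 0x10d60, the odd bitmap entry 0x103 expands to addresses 0x10d68 and + // 0x10da0. No symbol is at or below 0x10d68, so it prints bare; 0x10da0 + // prints as "base + 0x30" since the nearest preceding symbol is "base" at + // st_value 0x10d70.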
+ if (I) { + OS << " " << Syms[I - 1].second; + if (Syms[I - 1].first < Where) + OS << " + 0x" << Twine::utohexstr(Where - Syms[I - 1].first); + } + OS << '\n'; + }; + for (auto [Index, R] : enumerate(*RangeOrErr)) { + typename ELFT::uint Entry = R; + OS << formatv("{0:4}: ", Index) + << format_hex_no_prefix(Entry, ELFT::Is64Bits ? 16 : 8) << ' '; + if ((Entry & 1) == 0) { + Print(Entry); + Base = Entry + sizeof(typename ELFT::uint); + } else { + bool First = true; + for (auto Where = Base; Entry >>= 1; + Where += sizeof(typename ELFT::uint)) { + if (Entry & 1) { + if (First) + First = false; + else + OS.indent(ELFT::Is64Bits ? 24 : 16); + Print(Where); + } + } + Base += (CHAR_BIT * sizeof(Entry) - 1) * sizeof(typename ELFT::uint); + } + } +} + // Print the offset of a particular section from anyone of the ranges: // [SHT_LOOS, SHT_HIOS], [SHT_LOPROC, SHT_HIPROC], [SHT_LOUSER, SHT_HIUSER]. // If 'Type' does not fall within any of those ranges, then a string is diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp index 8049517acc9fa0..feb0231300f1fd 100644 --- a/llvm/unittests/Support/KnownBitsTest.cpp +++ b/llvm/unittests/Support/KnownBitsTest.cpp @@ -19,23 +19,11 @@ using namespace llvm; using UnaryBitsFn = llvm::function_ref<KnownBits(const KnownBits &)>; using UnaryIntFn = llvm::function_ref<std::optional<APInt>(const APInt &)>; -using UnaryCheckFn = llvm::function_ref<bool(const KnownBits &)>; using BinaryBitsFn = llvm::function_ref<KnownBits(const KnownBits &, const KnownBits &)>; using BinaryIntFn = llvm::function_ref<std::optional<APInt>(const APInt &, const APInt &)>; -using BinaryCheckFn = - llvm::function_ref<bool(const KnownBits &, const KnownBits &)>; - -static bool checkOptimalityUnary(const KnownBits &) { return true; } -static bool checkCorrectnessOnlyUnary(const KnownBits &) { return false; } -static bool checkOptimalityBinary(const KnownBits &, const KnownBits &) { - return true; -} -static bool checkCorrectnessOnlyBinary(const KnownBits &, const KnownBits &) { - return false; -} static testing::AssertionResult isCorrect(const KnownBits &Exact, const KnownBits &Computed, @@ -66,9 +54,8 @@ static testing::AssertionResult isOptimal(const KnownBits &Exact, return Result; } -static void -testUnaryOpExhaustive(UnaryBitsFn BitsFn, UnaryIntFn IntFn, - UnaryCheckFn CheckOptimalityFn = checkOptimalityUnary) { +static void testUnaryOpExhaustive(UnaryBitsFn BitsFn, UnaryIntFn IntFn, + bool CheckOptimality = true) { for (unsigned Bits : {1, 4}) { ForeachKnownBits(Bits, [&](const KnownBits &Known) { KnownBits Computed = BitsFn(Known); @@ -87,17 +74,16 @@ testUnaryOpExhaustive(UnaryBitsFn BitsFn, UnaryIntFn IntFn, EXPECT_TRUE(isCorrect(Exact, Computed, Known)); // We generally don't want to return conflicting known bits, even if it is // legal for always poison results. - if (CheckOptimalityFn(Known) && !Exact.hasConflict()) { + if (CheckOptimality && !Exact.hasConflict()) { EXPECT_TRUE(isOptimal(Exact, Computed, Known)); } }); } } -static void -testBinaryOpExhaustive(BinaryBitsFn BitsFn, BinaryIntFn IntFn, - BinaryCheckFn CheckOptimalityFn = checkOptimalityBinary, - bool RefinePoisonToZero = false) { +static void testBinaryOpExhaustive(BinaryBitsFn BitsFn, BinaryIntFn IntFn, + bool CheckOptimality = true, + bool RefinePoisonToZero = false) { for (unsigned Bits : {1, 4}) { ForeachKnownBits(Bits, [&](const KnownBits &Known1) { ForeachKnownBits(Bits, [&](const KnownBits &Known2) { @@ -119,7 +105,7 @@ testBinaryOpExhaustive(BinaryBitsFn BitsFn, BinaryIntFn IntFn, EXPECT_TRUE(isCorrect(Exact, Computed, {Known1, Known2})); // We generally don't want to return conflicting known bits, even if it // is legal for always poison results.
- if (CheckOptimalityFn(Known1, Known2) && !Exact.hasConflict()) { + if (CheckOptimality && !Exact.hasConflict()) { EXPECT_TRUE(isOptimal(Exact, Computed, {Known1, Known2})); } // In some cases we choose to return zero if the result is always @@ -325,7 +311,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.udiv(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::udiv(Known1, Known2, /*Exact*/ true); }, @@ -335,7 +321,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.udiv(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::sdiv(Known1, Known2); }, @@ -345,7 +331,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.sdiv(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::sdiv(Known1, Known2, /*Exact*/ true); }, @@ -356,7 +342,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.sdiv(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::urem, [](const APInt &N1, const APInt &N2) -> std::optional<APInt> { @@ -364,7 +350,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.urem(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::srem, [](const APInt &N1, const APInt &N2) -> std::optional<APInt> { @@ -372,31 +358,31 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.srem(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::sadd_sat, [](const APInt &N1, const APInt &N2) -> std::optional<APInt> { return N1.sadd_sat(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::uadd_sat, [](const APInt &N1, const APInt &N2) -> std::optional<APInt> { return N1.uadd_sat(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::ssub_sat, [](const APInt &N1, const APInt &N2) -> std::optional<APInt> { return N1.ssub_sat(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::usub_sat, [](const APInt &N1, const APInt &N2) -> std::optional<APInt> { return N1.usub_sat(N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::shl(Known1, Known2); }, @@ -406,7 +392,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.shl(N2); }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::shl(Known1, Known2, /* NUW */ true); }, @@ -418,7 +404,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return Res; }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::shl(Known1, Known2, /* NUW */ false, /* NSW */ true); }, @@ -430,7 +416,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return Res; }, - checkOptimalityBinary, /* RefinePoisonToZero
*/ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::shl(Known1, Known2, /* NUW */ true, /* NSW */ true); @@ -443,7 +429,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return Res; }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { @@ -454,7 +440,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.lshr(N2); }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::lshr(Known1, Known2, /*ShAmtNonZero=*/false, @@ -467,7 +453,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.lshr(N2); }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::ashr(Known1, Known2); @@ -477,7 +463,7 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.ashr(N2); }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::ashr(Known1, Known2, /*ShAmtNonZero=*/false, @@ -490,21 +476,21 @@ TEST(KnownBitsTest, BinaryExhaustive) { return std::nullopt; return N1.ashr(N2); }, - checkOptimalityBinary, /* RefinePoisonToZero */ true); + /*CheckOptimality=*/true, /* RefinePoisonToZero */ true); testBinaryOpExhaustive( [](const KnownBits &Known1, const KnownBits &Known2) { return KnownBits::mul(Known1, Known2); }, [](const APInt &N1, const APInt &N2) { return N1 * N2; }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::mulhs, [](const APInt &N1, const APInt &N2) { return APIntOps::mulhs(N1, N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); testBinaryOpExhaustive( KnownBits::mulhu, [](const APInt &N1, const APInt &N2) { return APIntOps::mulhu(N1, N2); }, - checkCorrectnessOnlyBinary); + /*CheckOptimality=*/false); } TEST(KnownBitsTest, UnaryExhaustive) { @@ -527,7 +513,7 @@ TEST(KnownBitsTest, UnaryExhaustive) { [](const KnownBits &Known) { return KnownBits::mul(Known, Known, /*SelfMultiply*/ true); }, - [](const APInt &N) { return N * N; }, checkCorrectnessOnlyUnary); + [](const APInt &N) { return N * N; }, /*CheckOptimality=*/false); } TEST(KnownBitsTest, WideShifts) { diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp index e9b4ac3d92e1dd..e9fd9671ea6ab5 100644 --- a/llvm/unittests/Support/VirtualFileSystemTest.cpp +++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp @@ -101,7 +101,7 @@ class DummyFileSystem : public vfs::FileSystem { std::map::iterator I; std::string Path; bool isInPath(StringRef S) { - if (Path.size() < S.size() && S.find(Path) == 0) { + if (Path.size() < S.size() && S.starts_with(Path)) { auto LastSep = S.find_last_of('/'); if (LastSep == Path.size() || LastSep == Path.size() - 1) return true; diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp index 63b827a35177e3..f61a05861981c5 100644 --- 
a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp +++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp @@ -55,11 +55,11 @@ class MacroFusionPredicatorEmitter { RecordKeeper &Records; CodeGenTarget Target; - void emitMacroFusionDecl(std::vector Fusions, PredicateExpander &PE, + void emitMacroFusionDecl(ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS); - void emitMacroFusionImpl(std::vector Fusions, PredicateExpander &PE, + void emitMacroFusionImpl(ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS); - void emitPredicates(std::vector &FirstPredicate, bool IsCommutable, + void emitPredicates(ArrayRef FirstPredicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS); void emitFirstPredicate(Record *SecondPredicate, bool IsCommutable, PredicateExpander &PE, raw_ostream &OS); @@ -76,7 +76,7 @@ class MacroFusionPredicatorEmitter { } // End anonymous namespace. void MacroFusionPredicatorEmitter::emitMacroFusionDecl( - std::vector Fusions, PredicateExpander &PE, raw_ostream &OS) { + ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS) { OS << "#ifdef GET_" << Target.getName() << "_MACRO_FUSION_PRED_DECL\n"; OS << "#undef GET_" << Target.getName() << "_MACRO_FUSION_PRED_DECL\n\n"; OS << "namespace llvm {\n"; @@ -93,7 +93,7 @@ void MacroFusionPredicatorEmitter::emitMacroFusionDecl( } void MacroFusionPredicatorEmitter::emitMacroFusionImpl( - std::vector Fusions, PredicateExpander &PE, raw_ostream &OS) { + ArrayRef Fusions, PredicateExpander &PE, raw_ostream &OS) { OS << "#ifdef GET_" << Target.getName() << "_MACRO_FUSION_PRED_IMPL\n"; OS << "#undef GET_" << Target.getName() << "_MACRO_FUSION_PRED_IMPL\n\n"; OS << "namespace llvm {\n"; @@ -121,9 +121,10 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl( OS << "\n#endif\n"; } -void MacroFusionPredicatorEmitter::emitPredicates( - std::vector &Predicates, bool IsCommutable, PredicateExpander &PE, - raw_ostream &OS) { +void MacroFusionPredicatorEmitter::emitPredicates(ArrayRef Predicates, + bool IsCommutable, + PredicateExpander &PE, + raw_ostream &OS) { for (Record *Predicate : Predicates) { Record *Target = Predicate->getValueAsDef("Target"); if (Target->getName() == "first_fusion_target") diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index e57bc6fb507e32..62916bd62c0119 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -82,7 +82,6 @@ static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) { OS << "#ifndef TUNE_PROC\n" << "#define TUNE_PROC(ENUM, NAME)\n" << "#endif\n\n"; - OS << "TUNE_PROC(GENERIC, \"generic\")\n"; for (const Record *Rec : RK.getAllDerivedDefinitions("RISCVTuneProcessorModel")) { diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 0e3674751fde6a..1440863ea5618f 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -15,7 +15,6 @@ #include "Common/CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/X86FoldTablesUtils.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" @@ -95,7 +94,7 @@ class X86FoldTablesEmitter { const CodeGenInstruction *MemInst) : RegInst(RegInst), MemInst(MemInst) {} - void print(formatted_raw_ostream &OS) const { + void print(raw_ostream &OS) const { OS.indent(2); OS << "{X86::" << 
RegInst->TheDef->getName() << ", "; OS << "X86::" << MemInst->TheDef->getName() << ", "; @@ -222,7 +221,7 @@ class X86FoldTablesEmitter { // Print the given table as a static const C++ array of type // X86FoldTableEntry. void printTable(const FoldTable &Table, StringRef TableName, - formatted_raw_ostream &OS) { + raw_ostream &OS) { OS << "static const X86FoldTableEntry " << TableName << "[] = {\n"; for (auto &E : Table) @@ -619,9 +618,7 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInst, } } -void X86FoldTablesEmitter::run(raw_ostream &O) { - formatted_raw_ostream OS(O); - +void X86FoldTablesEmitter::run(raw_ostream &OS) { // Holds all memory instructions std::vector MemInsts; // Holds all register instructions - divided according to opcode. diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md index 84e6664436d7b3..6a18e06593e8e9 100644 --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -119,3 +119,7 @@ This document describes the available MLIR passes and their contracts. ## TOSA Dialect Passes [include "TosaPasses.md"] + +## XeGPU Dialect Passes + +[include "XeGPUPasses.md"] diff --git a/mlir/examples/transform/Ch4/lib/MyExtension.cpp b/mlir/examples/transform/Ch4/lib/MyExtension.cpp index 26e348f2a30ec6..83e2dcd750bb39 100644 --- a/mlir/examples/transform/Ch4/lib/MyExtension.cpp +++ b/mlir/examples/transform/Ch4/lib/MyExtension.cpp @@ -142,7 +142,7 @@ mlir::transform::HasOperandSatisfyingOp::apply( transform::detail::prepareValueMappings( yieldedMappings, getBody().front().getTerminator()->getOperands(), state); - results.setParams(getPosition().cast(), + results.setParams(cast(getPosition()), {rewriter.getI32IntegerAttr(operand.getOperandNumber())}); for (auto &&[result, mapping] : llvm::zip(getResults(), yieldedMappings)) results.setMappedValues(result, mapping); diff --git a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h index 0891e2ba7be760..7ffc8613317603 100644 --- a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h +++ b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h @@ -31,11 +31,6 @@ convertArithFastMathAttrToLLVM(arith::FastMathFlagsAttr fmfAttr); LLVM::IntegerOverflowFlags convertArithOverflowFlagsToLLVM(arith::IntegerOverflowFlags arithFlags); -/// Creates an LLVM overflow attribute from a given arithmetic overflow -/// attribute. -LLVM::IntegerOverflowFlagsAttr -convertArithOverflowAttrToLLVM(arith::IntegerOverflowFlagsAttr flagsAttr); - /// Creates an LLVM rounding mode enum value from a given arithmetic rounding /// mode enum value. LLVM::RoundingMode @@ -72,6 +67,9 @@ class AttrConvertFastMathToLLVM { } ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } + LLVM::IntegerOverflowFlags getOverflowFlags() const { + return LLVM::IntegerOverflowFlags::none; + } private: NamedAttrList convertedAttr; @@ -89,19 +87,18 @@ class AttrConvertOverflowToLLVM { // Get the name of the arith overflow attribute. StringRef arithAttrName = SourceOp::getIntegerOverflowAttrName(); // Remove the source overflow attribute. 
- auto arithAttr = dyn_cast_if_present( - convertedAttr.erase(arithAttrName)); - if (arithAttr) { - StringRef targetAttrName = TargetOp::getIntegerOverflowAttrName(); - convertedAttr.set(targetAttrName, - convertArithOverflowAttrToLLVM(arithAttr)); + if (auto arithAttr = dyn_cast_if_present( + convertedAttr.erase(arithAttrName))) { + overflowFlags = convertArithOverflowFlagsToLLVM(arithAttr.getValue()); } } ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } + LLVM::IntegerOverflowFlags getOverflowFlags() const { return overflowFlags; } private: NamedAttrList convertedAttr; + LLVM::IntegerOverflowFlags overflowFlags = LLVM::IntegerOverflowFlags::none; }; template @@ -132,6 +129,9 @@ class AttrConverterConstrainedFPToLLVM { } ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } + LLVM::IntegerOverflowFlags getOverflowFlags() const { + return LLVM::IntegerOverflowFlags::none; + } private: NamedAttrList convertedAttr; diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h index f362167ee93249..f3bf5b66398e09 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -11,6 +11,7 @@ #include "mlir/Conversion/LLVMCommon/MemRefBuilder.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { @@ -18,13 +19,16 @@ class CallOpInterface; namespace LLVM { namespace detail { +/// Handle generically setting flags as native properties on LLVM operations. +void setNativeProperties(Operation *op, IntegerOverflowFlags overflowFlags); + /// Replaces the given operation "op" with a new operation of type "targetOp" /// and given operands. 
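Taken together, the hunks above change how overflow flags travel through the arith-to-LLVM lowering: AttrConvertOverflowToLLVM now swallows the source attribute and exposes the flags through getOverflowFlags(), and the rewrite helpers apply them as a native property on the op they create. A minimal sketch of that last step, mirroring the new LLVM::detail::setNativeProperties helper; loc, lhs, rhs, and srcFlags stand in for pattern-local values:

#include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
using namespace mlir;

// Sketch: propagate arith overflow flags onto a freshly created LLVM op as
// a native property instead of a discardable attribute.
static Operation *createAddWithFlags(OpBuilder &rewriter, Location loc,
                                     Value lhs, Value rhs,
                                     arith::IntegerOverflowFlags srcFlags) {
  LLVM::IntegerOverflowFlags flags =
      arith::convertArithOverflowFlagsToLLVM(srcFlags);
  Operation *newOp = rewriter.create<LLVM::AddOp>(loc, lhs, rhs);
  // The interface added in LLVMInterfaces.td writes straight into the op's
  // properties; no `overflowFlags` attribute ever materializes.
  if (auto iface = dyn_cast<LLVM::IntegerOverflowFlagsInterface>(newOp))
    iface.setOverflowFlags(flags);
  return newOp;
}

Because the flags live in the op's properties rather than its attribute dictionary, nothing needs to round-trip through an IntegerOverflowFlagsAttr, which is why convertArithOverflowAttrToLLVM can be deleted elsewhere in this patch.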
-LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter); +LogicalResult oneToOneRewrite( + Operation *op, StringRef targetOp, ValueRange operands, + ArrayRef targetAttrs, + const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, + IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none); } // namespace detail } // namespace LLVM diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index 279175b6128fc7..964281592cc65e 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -54,11 +54,11 @@ LogicalResult handleMultidimensionalVectors( std::function createOperand, ConversionPatternRewriter &rewriter); -LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter); +LogicalResult vectorOneToOneRewrite( + Operation *op, StringRef targetOp, ValueRange operands, + ArrayRef targetAttrs, + const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, + IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none); } // namespace detail } // namespace LLVM @@ -70,6 +70,9 @@ class AttrConvertPassThrough { AttrConvertPassThrough(SourceOp srcOp) : srcAttrs(srcOp->getAttrs()) {} ArrayRef getAttrs() const { return srcAttrs; } + LLVM::IntegerOverflowFlags getOverflowFlags() const { + return LLVM::IntegerOverflowFlags::none; + } private: ArrayRef srcAttrs; @@ -100,7 +103,8 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern { return LLVM::detail::vectorOneToOneRewrite( op, TargetOp::getOperationName(), adaptor.getOperands(), - attrConvert.getAttrs(), *this->getTypeConverter(), rewriter); + attrConvert.getAttrs(), *this->getTypeConverter(), rewriter, + attrConvert.getOverflowFlags()); } }; } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Async/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Async/IR/CMakeLists.txt index ebbf2df760faa4..2525eee2a34ec9 100644 --- a/mlir/include/mlir/Dialect/Async/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Async/IR/CMakeLists.txt @@ -1,2 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS AsyncOps.td) add_mlir_dialect(AsyncOps async) add_mlir_doc(AsyncOps AsyncDialect Dialects/ -gen-dialect-doc) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h index c03915667db653..5d9531cd124154 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h @@ -31,6 +31,9 @@ namespace mlir { namespace emitc { void buildTerminatedBody(OpBuilder &builder, Location loc); +/// Determines whether \p type is valid in EmitC. +bool isSupportedEmitCType(mlir::Type type); + /// Determines whether \p type is a valid integer type in EmitC. bool isSupportedIntegerType(mlir::Type type); diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index e611fd2f0f15c4..c1a1e77c34ab25 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -34,16 +34,16 @@ class EmitC_Op traits = []> // Base class for unary operations. 
class EmitC_UnaryOp traits = []> : EmitC_Op { - let arguments = (ins AnyType); - let results = (outs AnyType); + let arguments = (ins EmitCType); + let results = (outs EmitCType); let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; } // Base class for binary operations. class EmitC_BinaryOp traits = []> : EmitC_Op { - let arguments = (ins AnyType:$lhs, AnyType:$rhs); - let results = (outs AnyType); + let arguments = (ins EmitCType:$lhs, EmitCType:$rhs); + let results = (outs EmitCType); let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; } @@ -97,9 +97,9 @@ def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> { }]; let arguments = (ins Arg:$applicableOperator, - AnyType:$operand + EmitCType:$operand ); - let results = (outs AnyType:$result); + let results = (outs EmitCType:$result); let assemblyFormat = [{ $applicableOperator `(` $operand `)` attr-dict `:` functional-type($operand, results) }]; @@ -240,9 +240,9 @@ def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpression]> { Arg:$callee, Arg, "the order of operands and further attributes">:$args, Arg, "template arguments">:$template_args, - Variadic:$operands + Variadic:$operands ); - let results = (outs Variadic); + let results = (outs Variadic); let builders = [ OpBuilder<(ins "::mlir::TypeRange":$resultTypes, @@ -284,8 +284,8 @@ def EmitC_CastOp : EmitC_Op<"cast", ``` }]; - let arguments = (ins AnyType:$source); - let results = (outs AnyType:$dest); + let arguments = (ins EmitCType:$source); + let results = (outs EmitCType:$dest); let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; } @@ -323,9 +323,9 @@ def EmitC_CmpOp : EmitC_BinaryOp<"cmp", [CExpression]> { }]; let arguments = (ins EmitC_CmpPredicateAttr:$predicate, - AnyType:$lhs, - AnyType:$rhs); - let results = (outs AnyType); + EmitCType:$lhs, + EmitCType:$rhs); + let results = (outs EmitCType); let assemblyFormat = "$predicate `,` operands attr-dict `:` functional-type(operands, results)"; } @@ -353,7 +353,7 @@ def EmitC_ConstantOp : EmitC_Op<"constant", [ConstantLike]> { }]; let arguments = (ins EmitC_OpaqueOrTypedAttr:$value); - let results = (outs AnyType); + let results = (outs EmitCType); let hasFolder = 1; let hasVerifier = 1; @@ -423,7 +423,7 @@ def EmitC_ExpressionOp : EmitC_Op<"expression", }]; let arguments = (ins UnitAttr:$do_not_inline); - let results = (outs AnyType:$result); + let results = (outs EmitCType:$result); let regions = (region SizedRegion<1>:$region); let hasVerifier = 1; @@ -531,8 +531,8 @@ def EmitC_CallOp : EmitC_Op<"call", %2 = emitc.call @my_add(%0, %1) : (f32, f32) -> f32 ``` }]; - let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); - let results = (outs Variadic); + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); let builders = [ OpBuilder<(ins "FuncOp":$callee, CArg<"ValueRange", "{}">:$operands), [{ @@ -722,7 +722,7 @@ def EmitC_ReturnOp : EmitC_Op<"return", [Pure, HasParent<"FuncOp">, } ``` }]; - let arguments = (ins Optional:$operand); + let arguments = (ins Optional:$operand); let assemblyFormat = "attr-dict ($operand^ `:` type($operand))?"; let hasVerifier = 1; @@ -766,7 +766,7 @@ def EmitC_LiteralOp : EmitC_Op<"literal", [Pure]> { }]; let arguments = (ins StrAttr:$value); - let results = (outs AnyType:$result); + let results = (outs EmitCType:$result); let hasVerifier = 1; let assemblyFormat = "$value attr-dict `:` type($result)"; @@ -932,8 +932,8 @@ def EmitC_ConditionalOp : 
EmitC_Op<"conditional", int32_t v6 = v3 ? v4 : v5; ``` }]; - let arguments = (ins I1:$condition, AnyType:$true_value, AnyType:$false_value); - let results = (outs AnyType:$result); + let arguments = (ins I1:$condition, EmitCType:$true_value, EmitCType:$false_value); + let results = (outs EmitCType:$result); let assemblyFormat = "operands attr-dict `:` type($result)"; } @@ -1009,7 +1009,7 @@ def EmitC_VariableOp : EmitC_Op<"variable", []> { }]; let arguments = (ins EmitC_OpaqueOrTypedAttr:$value); - let results = (outs AnyType); + let results = (outs EmitCType); let hasVerifier = 1; } @@ -1068,7 +1068,7 @@ def EmitC_AssignOp : EmitC_Op<"assign", []> { ``` }]; - let arguments = (ins AnyType:$var, AnyType:$value); + let arguments = (ins EmitCType:$var, EmitCType:$value); let results = (outs); let hasVerifier = 1; @@ -1089,7 +1089,7 @@ def EmitC_YieldOp : EmitC_Op<"yield", value is yielded. }]; - let arguments = (ins Optional:$result); + let arguments = (ins Optional:$result); let builders = [OpBuilder<(ins), [{ /* nothing to do */ }]>]; let hasVerifier = 1; @@ -1173,8 +1173,8 @@ def EmitC_SubscriptOp : EmitC_Op<"subscript", []> { EmitC_OpaqueType, EmitC_PointerType]>, "the value to subscript">:$value, - Variadic:$indices); - let results = (outs AnyType:$result); + Variadic:$indices); + let results = (outs EmitCType:$result); let builders = [ OpBuilder<(ins "TypedValue":$array, "ValueRange":$indices), [{ diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td index bce5807230ce49..444395b915e250 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTypes.td @@ -22,6 +22,9 @@ include "mlir/IR/BuiltinTypeInterfaces.td" // EmitC type definitions //===----------------------------------------------------------------------===// +def EmitCType : Type, + "type supported by EmitC">; + def EmitCIntegerType : Type, "integer type supported by EmitC">; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td index cee752aeb269b7..7085f81e203a1e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td @@ -50,58 +50,40 @@ def FastmathFlagsInterface : OpInterface<"FastmathFlagsInterface"> { def IntegerOverflowFlagsInterface : OpInterface<"IntegerOverflowFlagsInterface"> { let description = [{ - Access to op integer overflow flags. + This interface defines an LLVM operation with integer overflow flags and + provides a uniform API for accessing them. 
}]; let cppNamespace = "::mlir::LLVM"; let methods = [ - InterfaceMethod< - /*desc=*/ "Returns an IntegerOverflowFlagsAttr attribute for the operation", - /*returnType=*/ "IntegerOverflowFlagsAttr", - /*methodName=*/ "getOverflowAttr", - /*args=*/ (ins), - /*methodBody=*/ [{}], - /*defaultImpl=*/ [{ - auto op = cast(this->getOperation()); - return op.getOverflowFlagsAttr(); - }] - >, - InterfaceMethod< - /*desc=*/ "Returns whether the operation has the No Unsigned Wrap keyword", - /*returnType=*/ "bool", - /*methodName=*/ "hasNoUnsignedWrap", - /*args=*/ (ins), - /*methodBody=*/ [{}], - /*defaultImpl=*/ [{ - auto op = cast(this->getOperation()); - IntegerOverflowFlags flags = op.getOverflowFlagsAttr().getValue(); - return bitEnumContainsAll(flags, IntegerOverflowFlags::nuw); - }] - >, - InterfaceMethod< - /*desc=*/ "Returns whether the operation has the No Signed Wrap keyword", - /*returnType=*/ "bool", - /*methodName=*/ "hasNoSignedWrap", - /*args=*/ (ins), - /*methodBody=*/ [{}], - /*defaultImpl=*/ [{ - auto op = cast(this->getOperation()); - IntegerOverflowFlags flags = op.getOverflowFlagsAttr().getValue(); - return bitEnumContainsAll(flags, IntegerOverflowFlags::nsw); - }] - >, - StaticInterfaceMethod< - /*desc=*/ [{Returns the name of the IntegerOverflowFlagsAttr attribute - for the operation}], - /*returnType=*/ "StringRef", - /*methodName=*/ "getIntegerOverflowAttrName", - /*args=*/ (ins), - /*methodBody=*/ [{}], - /*defaultImpl=*/ [{ - return "overflowFlags"; - }] - > + InterfaceMethod<[{ + Get the integer overflow flags for the operation. + }], "IntegerOverflowFlags", "getOverflowFlags", (ins), [{}], [{ + return $_op.getProperties().overflowFlags; + }]>, + InterfaceMethod<[{ + Set the integer overflow flags for the operation. + }], "void", "setOverflowFlags", (ins "IntegerOverflowFlags":$flags), [{}], [{ + $_op.getProperties().overflowFlags = flags; + }]>, + InterfaceMethod<[{ + Returns whether the operation has the No Unsigned Wrap keyword. + }], "bool", "hasNoUnsignedWrap", (ins), [{}], [{ + return bitEnumContainsAll($_op.getOverflowFlags(), + IntegerOverflowFlags::nuw); + }]>, + InterfaceMethod<[{ + Returns whether the operation has the No Signed Wrap keyword. + }], "bool", "hasNoSignedWrap", (ins), [{}], [{ + return bitEnumContainsAll($_op.getOverflowFlags(), + IntegerOverflowFlags::nsw); + }]>, + StaticInterfaceMethod<[{ + Get the attribute name of the overflow flags property. 
+ }], "StringRef", "getOverflowFlagsAttrName", (ins), [{}], [{ + return "overflowFlags"; + }]>, ]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index f8f9264b3889be..eedae4b9bb7c8e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -59,17 +59,30 @@ class LLVM_IntArithmeticOpWithOverflowFlag traits = []> : LLVM_ArithmeticOpBase], traits)> { - dag iofArg = ( - ins DefaultValuedAttr:$overflowFlags); + dag iofArg = (ins EnumProperty<"IntegerOverflowFlags">:$overflowFlags); let arguments = !con(commonArgs, iofArg); + + let builders = [ + OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs, + "IntegerOverflowFlags":$overflowFlags), [{ + build($_builder, $_state, type, lhs, rhs); + $_state.getOrAddProperties().overflowFlags = overflowFlags; + }]>, + OpBuilder<(ins "Value":$lhs, "Value":$rhs, + "IntegerOverflowFlags":$overflowFlags), [{ + build($_builder, $_state, lhs, rhs); + $_state.getOrAddProperties().overflowFlags = overflowFlags; + }]> + ]; + string mlirBuilder = [{ auto op = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs); - moduleImport.setIntegerOverflowFlagsAttr(inst, op); + moduleImport.setIntegerOverflowFlags(inst, op); $res = op; }]; let assemblyFormat = [{ - $lhs `,` $rhs (`overflow` `` $overflowFlags^)? - custom(attr-dict) `:` type($res) + $lhs `,` $rhs `` custom($overflowFlags) + `` custom(attr-dict) `:` type($res) }]; string llvmBuilder = "$res = builder.Create" # instName # diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h index ab4df2ab028d43..5e4b4f3a66af9d 100644 --- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h @@ -87,7 +87,7 @@ struct IndependentParallelIteratorDomainShardingInterface void populateIteratorTypes(Type t, SmallVector &iterTypes) const { - RankedTensorType rankedTensorType = t.dyn_cast(); + RankedTensorType rankedTensorType = dyn_cast(t); if (!rankedTensorType) { return; } @@ -106,7 +106,7 @@ struct ElementwiseShardingInterface ElementwiseShardingInterface, ElemwiseOp> { SmallVector getLoopIteratorTypes(Operation *op) const { Value val = op->getOperand(0); - auto type = val.getType().dyn_cast(); + auto type = dyn_cast(val.getType()); if (!type) return {}; SmallVector types(type.getRank(), @@ -117,7 +117,7 @@ struct ElementwiseShardingInterface SmallVector getIndexingMaps(Operation *op) const { MLIRContext *ctx = op->getContext(); Value val = op->getOperand(0); - auto type = val.getType().dyn_cast(); + auto type = dyn_cast(val.getType()); if (!type) return {}; int64_t rank = type.getRank(); diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h index a9bc3351f4cff0..ec3c2cb011c357 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h @@ -60,11 +60,11 @@ class MulOperandsAndResultElementType if (llvm::isa(resElemType)) return impl::verifySameOperandsAndResultElementType(op); - if (auto resIntType = resElemType.dyn_cast()) { + if (auto resIntType = dyn_cast(resElemType)) { IntegerType lhsIntType = - getElementTypeOrSelf(op->getOperand(0)).cast(); + cast(getElementTypeOrSelf(op->getOperand(0))); IntegerType rhsIntType = - getElementTypeOrSelf(op->getOperand(1)).cast(); + cast(getElementTypeOrSelf(op->getOperand(1))); if (lhsIntType != rhsIntType) 
 return op->emitOpError(
     "requires the same element type for all operands");
diff --git a/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt
index f33061b2d87cff..9f57627c321fb0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/Transforms/CMakeLists.txt
new file mode 100644
index 00000000000000..9de7e87c7d3995
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name XeGPU)
+add_public_tablegen_target(MLIRXeGPUPassIncGen)
+add_dependencies(mlir-headers MLIRXeGPUPassIncGen)
+
+add_mlir_doc(Passes XeGPUPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.h
new file mode 100644
index 00000000000000..bf55bde4b25b1f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.h
@@ -0,0 +1,35 @@
+//===- Passes.h - XeGPU Patterns and Passes ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_H
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+
+namespace xegpu {
+
+//===----------------------------------------------------------------------===//
+// Passes
+//===----------------------------------------------------------------------===//
+
+#define GEN_PASS_DECL
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+
+} // namespace xegpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
new file mode 100644
index 00000000000000..1ecd6ce95322bd
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -0,0 +1,26 @@
+//===-- Passes.td - XeGPU transformation definition file ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
+#define MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+def XeGPUFoldAliasOps : Pass<"xegpu-fold-alias-ops"> {
+  let summary = "Fold alias ops into XeGPU ops";
+  let description = [{
+    The pass folds aliasing ops into XeGPU ops so that they operate on the original
+    source references.
+ }]; + let dependentDialects = [ + "memref::MemRefDialect", "xegpu::XeGPUDialect" + ]; +} + +#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h new file mode 100644 index 00000000000000..63ea26df069372 --- /dev/null +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -0,0 +1,23 @@ +//===- Transforms.h - XeGPU Dialect transformations -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H +#define MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H + +namespace mlir { +class RewritePatternSet; + +namespace xegpu { + +/// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. +void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); + +} // namespace xegpu +} // namespace mlir + +#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H diff --git a/mlir/include/mlir/IR/Location.h b/mlir/include/mlir/IR/Location.h index d4268e804f4f7a..aa8314f38cdfac 100644 --- a/mlir/include/mlir/IR/Location.h +++ b/mlir/include/mlir/IR/Location.h @@ -154,7 +154,7 @@ class FusedLocWith : public FusedLoc { /// Support llvm style casting. static bool classof(Attribute attr) { auto fusedLoc = llvm::dyn_cast(attr); - return fusedLoc && fusedLoc.getMetadata().isa_and_nonnull(); + return fusedLoc && mlir::isa_and_nonnull(fusedLoc.getMetadata()); } }; diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index 5d90c197a6cced..90406f555b0f47 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -45,6 +45,7 @@ #include "mlir/Dialect/Tosa/Transforms/Passes.h" #include "mlir/Dialect/Transform/Transforms/Passes.h" #include "mlir/Dialect/Vector/Transforms/Passes.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Transforms/Passes.h" #include @@ -92,6 +93,7 @@ inline void registerAllPasses() { arm_sme::registerArmSMEPasses(); arm_sve::registerArmSVEPasses(); emitc::registerEmitCPasses(); + xegpu::registerXeGPUPasses(); // Dialect pipelines bufferization::registerBufferizationPipelines(); diff --git a/mlir/include/mlir/Target/LLVMIR/Import.h b/mlir/include/mlir/Target/LLVMIR/Import.h index 18c5c48bd9504b..4a1242a8eb0590 100644 --- a/mlir/include/mlir/Target/LLVMIR/Import.h +++ b/mlir/include/mlir/Target/LLVMIR/Import.h @@ -14,8 +14,6 @@ #define MLIR_TARGET_LLVMIR_IMPORT_H #include "mlir/IR/OwningOpRef.h" -#include "mlir/Support/LLVM.h" -#include "llvm/ADT/StringRef.h" #include // Forward-declare LLVM classes. @@ -34,12 +32,17 @@ class ModuleOp; /// The translation supports operations from any dialect that has a registered /// implementation of the LLVMImportDialectInterface. It returns nullptr if the /// translation fails and reports errors using the error handler registered with -/// the MLIR context. The `emitExpensiveWarnings` option controls if expensive +/// the MLIR context. +/// The `emitExpensiveWarnings` option controls if expensive /// but uncritical diagnostics should be emitted. +/// The `dropDICompositeTypeElements` option controls if DICompositeTypes should +/// be imported without elements. 
If set, the option avoids the recursive +/// traversal of composite type debug information, which can be expensive for +/// adversarial inputs. OwningOpRef translateLLVMIRToModule(std::unique_ptr llvmModule, - MLIRContext *context, - bool emitExpensiveWarnings = true); + MLIRContext *context, bool emitExpensiveWarnings = true, + bool dropDICompositeTypeElements = false); /// Translate the given LLVM data layout into an MLIR equivalent using the DLTI /// dialect. diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index b551eb937cfe8d..04d098d38155bf 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -47,7 +47,7 @@ class LoopAnnotationImporter; class ModuleImport { public: ModuleImport(ModuleOp mlirModule, std::unique_ptr llvmModule, - bool emitExpensiveWarnings); + bool emitExpensiveWarnings, bool importEmptyDICompositeTypes); /// Calls the LLVMImportInterface initialization that queries the registered /// dialect interfaces for the supported LLVM IR intrinsics and metadata kinds @@ -183,8 +183,7 @@ class ModuleImport { /// Sets the integer overflow flags (nsw/nuw) attribute for the imported /// operation `op` given the original instruction `inst`. Asserts if the /// operation does not implement the integer overflow flag interface. - void setIntegerOverflowFlagsAttr(llvm::Instruction *inst, - Operation *op) const; + void setIntegerOverflowFlags(llvm::Instruction *inst, Operation *op) const; /// Sets the fastmath flags attribute for the imported operation `op` given /// the original instruction `inst`. Asserts if the operation does not diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp index 4669c40f843d94..21c66f38a8af03 100644 --- a/mlir/lib/CAPI/Dialect/LLVM.cpp +++ b/mlir/lib/CAPI/Dialect/LLVM.cpp @@ -135,7 +135,7 @@ MlirAttribute mlirLLVMDIExpressionAttrGet(MlirContext ctx, intptr_t nOperations, unwrap(ctx), llvm::map_to_vector( unwrapList(nOperations, operations, attrStorage), - [](Attribute a) { return a.cast(); }))); + [](Attribute a) { return cast(a); }))); } MlirAttribute mlirLLVMDINullTypeAttrGet(MlirContext ctx) { @@ -165,7 +165,7 @@ MlirAttribute mlirLLVMDICompositeTypeAttrGet( cast(unwrap(scope)), cast(unwrap(baseType)), DIFlags(flags), sizeInBits, alignInBits, llvm::map_to_vector(unwrapList(nElements, elements, elementsStorage), - [](Attribute a) { return a.cast(); }))); + [](Attribute a) { return cast(a); }))); } MlirAttribute @@ -259,7 +259,7 @@ MlirAttribute mlirLLVMDISubroutineTypeAttrGet(MlirContext ctx, return wrap(DISubroutineTypeAttr::get( unwrap(ctx), callingConvention, llvm::map_to_vector(unwrapList(nTypes, types, attrStorage), - [](Attribute a) { return a.cast(); }))); + [](Attribute a) { return cast(a); }))); } MlirAttribute mlirLLVMDISubprogramAttrGet( diff --git a/mlir/lib/CAPI/IR/BuiltinTypes.cpp b/mlir/lib/CAPI/IR/BuiltinTypes.cpp index e1a5d82587cf9e..c94c070144a7e9 100644 --- a/mlir/lib/CAPI/IR/BuiltinTypes.cpp +++ b/mlir/lib/CAPI/IR/BuiltinTypes.cpp @@ -311,11 +311,11 @@ MlirType mlirVectorTypeGetScalableChecked(MlirLocation loc, intptr_t rank, } bool mlirVectorTypeIsScalable(MlirType type) { - return unwrap(type).cast().isScalable(); + return cast(unwrap(type)).isScalable(); } bool mlirVectorTypeIsDimScalable(MlirType type, intptr_t dim) { - return unwrap(type).cast().getScalableDims()[dim]; + return cast(unwrap(type)).getScalableDims()[dim]; } 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 7e073bae75c0c9..033e66c6118f30 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -371,7 +371,7 @@ static void wmmaPushInputOperand(ConversionPatternRewriter &rewriter, bool isUnsigned, Value llvmInput, SmallVector &operands) { Type inputType = llvmInput.getType(); - auto vectorType = inputType.dyn_cast(); + auto vectorType = dyn_cast(inputType); Type elemType = vectorType.getElementType(); if (elemType.isBF16()) @@ -414,7 +414,7 @@ static void wmmaPushOutputOperand(ConversionPatternRewriter &rewriter, Value output, int32_t subwordOffset, bool clamp, SmallVector &operands) { Type inputType = output.getType(); - auto vectorType = inputType.dyn_cast(); + auto vectorType = dyn_cast(inputType); Type elemType = vectorType.getElementType(); if (elemType.isBF16()) output = rewriter.create( @@ -569,9 +569,8 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, /// on the architecture you are compiling for. static std::optional wmmaOpToIntrinsic(WMMAOp wmma, Chipset chipset) { - - auto sourceVectorType = wmma.getSourceA().getType().dyn_cast(); - auto destVectorType = wmma.getDestC().getType().dyn_cast(); + auto sourceVectorType = dyn_cast(wmma.getSourceA().getType()); + auto destVectorType = dyn_cast(wmma.getDestC().getType()); auto elemSourceType = sourceVectorType.getElementType(); auto elemDestType = destVectorType.getElementType(); @@ -727,7 +726,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite( Type f32 = getTypeConverter()->convertType(op.getResult().getType()); Value source = adaptor.getSource(); - auto sourceVecType = op.getSource().getType().dyn_cast(); + auto sourceVecType = dyn_cast(op.getSource().getType()); Type sourceElemType = getElementTypeOrSelf(op.getSource()); // Extend to a v4i8 if (!sourceVecType || sourceVecType.getNumElements() < 4) { diff --git a/mlir/lib/Conversion/ArithCommon/AttrToLLVMConverter.cpp b/mlir/lib/Conversion/ArithCommon/AttrToLLVMConverter.cpp index f12eba98480d33..cf60a048f782c6 100644 --- a/mlir/lib/Conversion/ArithCommon/AttrToLLVMConverter.cpp +++ b/mlir/lib/Conversion/ArithCommon/AttrToLLVMConverter.cpp @@ -49,13 +49,6 @@ LLVM::IntegerOverflowFlags mlir::arith::convertArithOverflowFlagsToLLVM( return llvmFlags; } -LLVM::IntegerOverflowFlagsAttr mlir::arith::convertArithOverflowAttrToLLVM( - arith::IntegerOverflowFlagsAttr flagsAttr) { - arith::IntegerOverflowFlags arithFlags = flagsAttr.getValue(); - return LLVM::IntegerOverflowFlagsAttr::get( - flagsAttr.getContext(), convertArithOverflowFlagsToLLVM(arithFlags)); -} - LLVM::RoundingMode mlir::arith::convertArithRoundingModeToLLVM(arith::RoundingMode roundingMode) { switch (roundingMode) { diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 0113a3df0b8e3d..3d3ff001c541b5 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -65,7 +65,7 @@ static Value castF32To(Type elementType, Value f32, Location loc, LogicalResult ExtFOnFloat8RewritePattern::match(arith::ExtFOp op) const { Type inType = op.getIn().getType(); - if (auto inVecType = inType.dyn_cast()) { + if (auto inVecType = dyn_cast(inType)) { if (inVecType.isScalable()) return failure(); if (inVecType.getShape().size() > 
1) @@ -81,13 +81,13 @@ void ExtFOnFloat8RewritePattern::rewrite(arith::ExtFOp op, Location loc = op.getLoc(); Value in = op.getIn(); Type outElemType = getElementTypeOrSelf(op.getOut().getType()); - if (!in.getType().isa()) { + if (!isa(in.getType())) { Value asFloat = rewriter.create( loc, rewriter.getF32Type(), in, 0); Value result = castF32To(outElemType, asFloat, loc, rewriter); return rewriter.replaceOp(op, result); } - VectorType inType = in.getType().cast(); + VectorType inType = cast(in.getType()); int64_t numElements = inType.getNumElements(); Value zero = rewriter.create( loc, outElemType, rewriter.getFloatAttr(outElemType, 0.0)); @@ -179,7 +179,7 @@ LogicalResult TruncFToFloat8RewritePattern::match(arith::TruncFOp op) const { if (op.getRoundingmodeAttr()) return failure(); Type outType = op.getOut().getType(); - if (auto outVecType = outType.dyn_cast()) { + if (auto outVecType = dyn_cast(outType)) { if (outVecType.isScalable()) return failure(); if (outVecType.getShape().size() > 1) @@ -202,7 +202,7 @@ void TruncFToFloat8RewritePattern::rewrite(arith::TruncFOp op, if (saturateFP8) in = clampInput(rewriter, loc, outElemType, in); VectorType truncResType = VectorType::get(4, outElemType); - if (!in.getType().isa()) { + if (!isa(in.getType())) { Value asFloat = castToF32(in, loc, rewriter); Value asF8s = rewriter.create( loc, truncResType, asFloat, /*sourceB=*/nullptr, 0, @@ -210,7 +210,7 @@ void TruncFToFloat8RewritePattern::rewrite(arith::TruncFOp op, Value result = rewriter.create(loc, asF8s, 0); return rewriter.replaceOp(op, result); } - VectorType outType = op.getOut().getType().cast(); + VectorType outType = cast(op.getOut().getType()); int64_t numElements = outType.getNumElements(); Value zero = rewriter.create( loc, outElemType, rewriter.getFloatAttr(outElemType, 0.0)); diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 993c09b03c0fde..36e10372e4bc5b 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -214,7 +214,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, llvm::enumerate(gpuFuncOp.getArgumentTypes())) { auto remapping = signatureConversion.getInputMapping(idx); NamedAttrList argAttr = - argAttrs ? argAttrs[idx].cast() : NamedAttrList(); + argAttrs ? 
cast(argAttrs[idx]) : NamedAttrList(); auto copyAttribute = [&](StringRef attrName) { Attribute attr = argAttr.erase(attrName); if (!attr) @@ -234,9 +234,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, return; } for (size_t i = 0, e = remapping->size; i < e; ++i) { - if (llvmFuncOp.getArgument(remapping->inputNo + i) - .getType() - .isa()) { + if (isa( + llvmFuncOp.getArgument(remapping->inputNo + i).getType())) { llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr); } } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 78d4e806246872..3a4fc7d8063f40 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -668,7 +668,7 @@ static int32_t getCuSparseLtDataTypeFrom(Type type) { static int32_t getCuSparseDataTypeFrom(Type type) { if (llvm::isa(type)) { // get the element type - auto elementType = type.cast().getElementType(); + auto elementType = cast(type).getElementType(); if (elementType.isBF16()) return 15; // CUDA_C_16BF if (elementType.isF16()) diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp index 83c31a204efc7e..1886dfa870961a 100644 --- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -329,14 +329,19 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors( // Detail methods //===----------------------------------------------------------------------===// +void LLVM::detail::setNativeProperties(Operation *op, + IntegerOverflowFlags overflowFlags) { + if (auto iface = dyn_cast(op)) + iface.setOverflowFlags(overflowFlags); +} + /// Replaces the given operation "op" with a new operation of type "targetOp" /// and given operands. -LogicalResult -LLVM::detail::oneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter) { +LogicalResult LLVM::detail::oneToOneRewrite( + Operation *op, StringRef targetOp, ValueRange operands, + ArrayRef targetAttrs, + const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, + IntegerOverflowFlags overflowFlags) { unsigned numResults = op->getNumResults(); SmallVector resultTypes; @@ -352,6 +357,8 @@ LLVM::detail::oneToOneRewrite(Operation *op, StringRef targetOp, rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), operands, resultTypes, targetAttrs); + setNativeProperties(newOp, overflowFlags); + // If the operation produced 0 or 1 result, return them immediately. 
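Zooming out from oneToOneRewrite: the new overflowFlags parameter threads from the attr-converter through vectorOneToOneRewrite (including each unrolled 1-D op) down to setNativeProperties. At registration time the pieces compose as sketched below; the alias and the populate function are illustrative, but this is the shape in which arith-to-LLVM wires its integer patterns:

#include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h"
#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"

// AttrConvertOverflowToLLVM erases the source `overflowFlags` attribute and
// surfaces it through getOverflowFlags(); vectorOneToOneRewrite then stamps
// the flags onto the created LLVM op as a native property.
using AddIOpLowering =
    mlir::VectorConvertToLLVMPattern<mlir::arith::AddIOp, mlir::LLVM::AddOp,
                                     mlir::arith::AttrConvertOverflowToLLVM>;

void populateHypotheticalAddLowering(mlir::LLVMTypeConverter &converter,
                                     mlir::RewritePatternSet &patterns) {
  patterns.add<AddIOpLowering>(converter);
}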
if (numResults == 0) return rewriter.eraseOp(op), success(); diff --git a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp index 544bcc71aca1b5..626135c10a3e96 100644 --- a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp @@ -103,12 +103,11 @@ LogicalResult LLVM::detail::handleMultidimensionalVectors( return success(); } -LogicalResult -LLVM::detail::vectorOneToOneRewrite(Operation *op, StringRef targetOp, - ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, - ConversionPatternRewriter &rewriter) { +LogicalResult LLVM::detail::vectorOneToOneRewrite( + Operation *op, StringRef targetOp, ValueRange operands, + ArrayRef targetAttrs, + const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, + IntegerOverflowFlags overflowFlags) { assert(!operands.empty()); // Cannot convert ops if their operands are not of LLVM type. @@ -118,14 +117,15 @@ LLVM::detail::vectorOneToOneRewrite(Operation *op, StringRef targetOp, auto llvmNDVectorTy = operands[0].getType(); if (!isa(llvmNDVectorTy)) return oneToOneRewrite(op, targetOp, operands, targetAttrs, typeConverter, - rewriter); + rewriter, overflowFlags); - auto callback = [op, targetOp, targetAttrs, &rewriter](Type llvm1DVectorTy, - ValueRange operands) { - return rewriter - .create(op->getLoc(), rewriter.getStringAttr(targetOp), operands, - llvm1DVectorTy, targetAttrs) - ->getResult(0); + auto callback = [op, targetOp, targetAttrs, overflowFlags, + &rewriter](Type llvm1DVectorTy, ValueRange operands) { + Operation *newOp = + rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), + operands, llvm1DVectorTy, targetAttrs); + LLVM::detail::setNativeProperties(newOp, overflowFlags); + return newOp->getResult(0); }; return handleMultidimensionalVectors(op, operands, typeConverter, callback, diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index 9b5d19ebd783a9..11d29754aa760e 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -1579,7 +1579,7 @@ struct NVGPUWarpgroupMmaStoreOpLowering if (offset) ti = makeAdd(ti, makeConst(offset)); - auto structType = matrixD.getType().cast(); + auto structType = cast(matrixD.getType()); // Number of 32-bit registers owns per thread constexpr unsigned numAdjacentRegisters = 2; @@ -1606,9 +1606,9 @@ struct NVGPUWarpgroupMmaStoreOpLowering int offset = 0; ImplicitLocOpBuilder b(op->getLoc(), rewriter); Value matriDValue = adaptor.getMatrixD(); - auto stype = matriDValue.getType().cast(); + auto stype = cast(matriDValue.getType()); for (auto [idx, matrixD] : llvm::enumerate(stype.getBody())) { - auto structType = matrixD.cast(); + auto structType = cast(matrixD); Value innerStructValue = b.create(matriDValue, idx); storeFragmentedMatrix(b, innerStructValue, op.getDstMemref(), offset); offset += structType.getBody().size(); @@ -1626,13 +1626,9 @@ struct NVGPUWarpgroupMmaInitAccumulatorOpLowering matchAndRewrite(nvgpu::WarpgroupMmaInitAccumulatorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { ImplicitLocOpBuilder b(op->getLoc(), rewriter); - LLVM::LLVMStructType packStructType = - getTypeConverter() - ->convertType(op.getMatrixC().getType()) - .cast(); - Type elemType = packStructType.getBody() - .front() - .cast() + LLVM::LLVMStructType packStructType = cast( + 
getTypeConverter()->convertType(op.getMatrixC().getType())); + Type elemType = cast(packStructType.getBody().front()) .getBody() .front(); Value zero = b.create(elemType, b.getZeroAttr(elemType)); @@ -1640,7 +1636,7 @@ struct NVGPUWarpgroupMmaInitAccumulatorOpLowering SmallVector innerStructs; // Unpack the structs and set all values to zero for (auto [idx, s] : llvm::enumerate(packStructType.getBody())) { - auto structType = s.cast(); + auto structType = cast(s); Value structValue = b.create(packStruct, idx); for (unsigned i = 0; i < structType.getBody().size(); ++i) { structValue = b.create( diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index ef8d59c9b26082..b6b85cab5a3821 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -618,7 +618,7 @@ static Value expandRank(PatternRewriter &rewriter, Location loc, Value tensor, static SmallVector expandInputRanks(PatternRewriter &rewriter, Location loc, Operation *operation) { auto rank = - operation->getResultTypes().front().cast().getRank(); + cast(operation->getResultTypes().front()).getRank(); return llvm::map_to_vector(operation->getOperands(), [&](Value operand) { return expandRank(rewriter, loc, operand, rank); }); @@ -680,7 +680,7 @@ computeTargetSize(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, // dimension, that is the target size. An occurrence of an additional static // dimension greater than 1 with a different value is undefined behavior. for (auto operand : operands) { - auto size = operand.getType().cast().getDimSize(dim); + auto size = cast(operand.getType()).getDimSize(dim); if (!ShapedType::isDynamic(size) && size > 1) return {rewriter.getIndexAttr(size), operand}; } @@ -688,7 +688,7 @@ computeTargetSize(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, // Filter operands with dynamic dimension auto operandsWithDynamicDim = llvm::to_vector(llvm::make_filter_range(operands, [&](Value operand) { - return operand.getType().cast().isDynamicDim(dim); + return cast(operand.getType()).isDynamicDim(dim); })); // If no operand has a dynamic dimension, it means all sizes were 1 @@ -718,7 +718,7 @@ static std::pair, SmallVector> computeTargetShape(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, ValueRange operands) { assert(!operands.empty()); - auto rank = operands.front().getType().cast().getRank(); + auto rank = cast(operands.front().getType()).getRank(); SmallVector targetShape; SmallVector masterOperands; for (auto dim : llvm::seq(0, rank)) { @@ -735,7 +735,7 @@ static Value broadcastDynamicDimension(PatternRewriter &rewriter, Location loc, int64_t dim, OpFoldResult targetSize, Value masterOperand) { // Nothing to do if this is a static dimension - auto rankedTensorType = operand.getType().cast(); + auto rankedTensorType = cast(operand.getType()); if (!rankedTensorType.isDynamicDim(dim)) return operand; @@ -817,7 +817,7 @@ static Value broadcastDynamicDimensions(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value operand, ArrayRef targetShape, ArrayRef masterOperands) { - int64_t rank = operand.getType().cast().getRank(); + int64_t rank = cast(operand.getType()).getRank(); assert((int64_t)targetShape.size() == rank); assert((int64_t)masterOperands.size() == rank); for (auto index : llvm::seq(0, rank)) @@ -848,8 +848,7 @@ emitElementwiseComputation(PatternRewriter &rewriter, Location loc, Operation *operation, ValueRange operands, 
ArrayRef targetShape) { // Generate output tensor - auto resultType = - operation->getResultTypes().front().cast(); + auto resultType = cast(operation->getResultTypes().front()); Value outputTensor = rewriter.create( loc, targetShape, resultType.getElementType()); @@ -2274,8 +2273,7 @@ struct RFFT2dConverter final : public OpRewritePattern { llvm::SmallVector staticSizes; dispatchIndexOpFoldResults(dims, dynamicSizes, staticSizes); - auto elementType = - input.getType().cast().getElementType(); + auto elementType = cast(input.getType()).getElementType(); return RankedTensorType::get(staticSizes, elementType); } @@ -2327,7 +2325,7 @@ struct RFFT2dConverter final : public OpRewritePattern { auto loc = rfft2d.getLoc(); auto input = rfft2d.getInput(); auto elementType = - input.getType().cast().getElementType().cast(); + cast(cast(input.getType()).getElementType()); // Compute the output type and set of dynamic sizes llvm::SmallVector dynamicSizes; diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index 399c0450824ee5..3f92372d7cea98 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -1204,10 +1204,10 @@ convertElementwiseOp(RewriterBase &rewriter, Operation *op, return rewriter.notifyMatchFailure(op, "no mapping"); matrixOperands.push_back(it->second); } - auto resultType = matrixOperands[0].getType().cast(); + auto resultType = cast(matrixOperands[0].getType()); if (opType == gpu::MMAElementwiseOp::EXTF) { // The floating point extension case has a different result type. - auto vectorType = op->getResultTypes()[0].cast(); + auto vectorType = cast(op->getResultTypes()[0]); resultType = gpu::MMAMatrixType::get(resultType.getShape(), vectorType.getElementType(), resultType.getOperand()); diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 85d10f326e260e..1b9975237c699b 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -631,8 +631,7 @@ static Value createMaskNeutralValue(ConversionPatternRewriter &rewriter, Type vectorType) { const auto &floatSemantics = cast(llvmType).getFloatSemantics(); auto value = getMaskNeutralValue(MaskNeutral{}, floatSemantics); - auto denseValue = - DenseElementsAttr::get(vectorType.cast(), value); + auto denseValue = DenseElementsAttr::get(cast(vectorType), value); return rewriter.create(loc, vectorType, denseValue); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 2575ad4984814b..e3beceaa3bbb5b 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -227,8 +227,8 @@ LogicalResult WMMAOp::verify() { Type sourceAType = getSourceA().getType(); Type destType = getDestC().getType(); - VectorType sourceVectorAType = sourceAType.dyn_cast(); - VectorType destVectorType = destType.dyn_cast(); + VectorType sourceVectorAType = dyn_cast(sourceAType); + VectorType destVectorType = dyn_cast(destType); Type sourceAElemType = sourceVectorAType.getElementType(); Type destElemType = destVectorType.getElementType(); diff --git a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp index d7492c9e25db31..5e69a98db8f1ee 100644 --- a/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp +++ 
b/mlir/lib/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.cpp @@ -26,7 +26,7 @@ struct ConstantOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto constantOp = cast(op); - auto type = constantOp.getType().dyn_cast(); + auto type = dyn_cast(constantOp.getType()); // Only ranked tensors are supported. if (!type) diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp index b9ab95b92496e3..4a50da3513f99b 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp @@ -106,7 +106,7 @@ void mlir::arith::populateEmulateUnsupportedFloatsConversions( targetType](Type type) -> std::optional { if (llvm::is_contained(sourceTypes, type)) return targetType; - if (auto shaped = type.dyn_cast()) + if (auto shaped = dyn_cast(type)) if (llvm::is_contained(sourceTypes, shaped.getElementType())) return shaped.clone(targetType); // All other types legal diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp index 3ae894692089b3..3635cd319a0785 100644 --- a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp +++ b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp @@ -99,7 +99,7 @@ class LowerContractionToSMMLAPattern Value extsiLhs; Value extsiRhs; if (auto lhsExtInType = - origLhsExtOp.getIn().getType().dyn_cast()) { + dyn_cast(origLhsExtOp.getIn().getType())) { if (lhsExtInType.getElementTypeBitWidth() <= 8) { Type targetLhsExtTy = matchContainerType(rewriter.getI8Type(), lhsExtInType); @@ -108,7 +108,7 @@ class LowerContractionToSMMLAPattern } } if (auto rhsExtInType = - origRhsExtOp.getIn().getType().dyn_cast()) { + dyn_cast(origRhsExtOp.getIn().getType())) { if (rhsExtInType.getElementTypeBitWidth() <= 8) { Type targetRhsExtTy = matchContainerType(rewriter.getI8Type(), rhsExtInType); @@ -133,6 +133,9 @@ class LowerContractionToSMMLAPattern smmlaShape.insert(smmlaShape.begin(), isVecmat ? 1 : 2); loopOrder.push_back(2); } + + // Keep track of the previous accumulator when tiling over K. + Value kAcc; for (SmallVector offsets : StaticTileOffsetRange(unrolledSize, smmlaShape, loopOrder)) { // Helper to compute the new shape of each operand and extract the slice. 
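The `kAcc` value declared above carries the accumulator across the unrolled K dimension: only the first K tile reads the original `acc` slice, and every later tile consumes the previous smmla result, as the hunk below implements. A minimal standalone sketch of that chaining; `TileAcc` and `emitSmmla` are hypothetical stand-ins for the vector-dialect ops, not names from the patch:

#include <cstddef>
#include <vector>

struct TileAcc { int value; };

// Stand-in for one arm_neon.smmla step: acc + lhsTile * rhsTile.
static TileAcc emitSmmla(TileAcc acc, int lhsTile, int rhsTile) {
  return {acc.value + lhsTile * rhsTile};
}

static TileAcc contractOverK(const std::vector<int> &lhsTiles,
                             const std::vector<int> &rhsTiles, TileAcc acc) {
  TileAcc kAcc = acc; // first K tile reads the original accumulator
  for (std::size_t k = 0; k < lhsTiles.size(); ++k)
    kAcc = emitSmmla(kAcc, lhsTiles[k], rhsTiles[k]); // later tiles chain
  return kAcc;
}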
@@ -161,9 +164,9 @@ class LowerContractionToSMMLAPattern extractOperand(op.getAcc(), accPermutationMap, accOffsets); auto inputElementType = - tiledLhs.getType().cast().getElementType(); + cast(tiledLhs.getType()).getElementType(); auto accElementType = - tiledAcc.getType().cast().getElementType(); + cast(tiledAcc.getType()).getElementType(); auto inputExpandedType = VectorType::get({2, 8}, inputElementType); auto outputExpandedType = VectorType::get({2, 2}, accElementType); @@ -175,9 +178,9 @@ class LowerContractionToSMMLAPattern auto emptyOperand = rewriter.create( loc, expandedTypeType, rewriter.getZeroAttr(expandedTypeType)); SmallVector offsets( - emptyOperand.getType().cast().getRank(), 0); + cast(emptyOperand.getType()).getRank(), 0); SmallVector strides( - tiledOperand.getType().cast().getRank(), 1); + cast(tiledOperand.getType()).getRank(), 1); return rewriter.createOrFold( loc, tiledOperand, emptyOperand, offsets, strides); }; @@ -194,19 +197,26 @@ class LowerContractionToSMMLAPattern tiledRhs.getLoc(), collapsedInputType, tiledRhs); auto collapsedOutputType = VectorType::get(outputExpandedType.getNumElements(), accElementType); - auto collapsedRes = rewriter.createOrFold( - tiledAcc.getLoc(), collapsedOutputType, tiledAcc); + + bool initialKAcc = offsets.back() == 0; + Value collapsedRes; + if (!initialKAcc) { + collapsedRes = kAcc; + } else { + collapsedRes = rewriter.createOrFold( + tiledAcc.getLoc(), collapsedOutputType, tiledAcc); + } // Insert contract op - auto smmlaOp = rewriter.createOrFold( + kAcc = rewriter.createOrFold( op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs, collapsedRhs); // Reshape output back to 2D Value tiledRes = rewriter.createOrFold( - smmlaOp.getLoc(), tiledAcc.getType(), smmlaOp); + kAcc.getLoc(), tiledAcc.getType(), kAcc); - // With vecmat, only one row of tiled ACC can be inserted inot file result + // With vecmat, only one row of tiled ACC can be inserted into file result if (isVecmat) { tiledRes = rewriter.createOrFold(loc, tiledRes, 0); } @@ -214,7 +224,7 @@ class LowerContractionToSMMLAPattern // Insert the tiled result back into the non tiled result of the // contract op. SmallVector strides( - tiledRes.getType().cast().getRank(), 1); + cast(tiledRes.getType()).getRank(), 1); result = rewriter.createOrFold( loc, tiledRes, result, accOffsets, strides); } diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp index a5ea42b7d701d0..b197786c320548 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp @@ -39,7 +39,7 @@ static Value buildBoolValue(OpBuilder &builder, Location loc, bool value) { return builder.create(loc, builder.getBoolAttr(value)); } -static bool isMemref(Value v) { return v.getType().isa(); } +static bool isMemref(Value v) { return isa(v.getType()); } //===----------------------------------------------------------------------===// // Ownership @@ -222,8 +222,8 @@ bool ValueComparator::operator()(const Value &lhs, const Value &rhs) const { return false; // Block arguments are less than results. 
-  bool lhsIsBBArg = lhs.isa<BlockArgument>();
-  if (lhsIsBBArg != rhs.isa<BlockArgument>()) {
+  bool lhsIsBBArg = isa<BlockArgument>(lhs);
+  if (lhsIsBBArg != isa<BlockArgument>(rhs)) {
     return lhsIsBBArg;
   }
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index c2b2b99fc0083b..d51d63f243ea0c 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -684,7 +684,7 @@ bufferization::getBufferType(Value value, const BufferizationOptions &options,
   // Op is not bufferizable.
   auto memSpace =
-      options.defaultMemorySpaceFn(value.getType().cast<TensorType>());
+      options.defaultMemorySpaceFn(cast<TensorType>(value.getType()));
   if (!memSpace.has_value())
     return op->emitError("could not infer memory space");
@@ -939,7 +939,7 @@ FailureOr<BaseMemRefType> bufferization::detail::defaultGetBufferType(
   // If we do not know the memory space and there is no default memory space,
   // report a failure.
   auto memSpace =
-      options.defaultMemorySpaceFn(value.getType().cast<TensorType>());
+      options.defaultMemorySpaceFn(cast<TensorType>(value.getType()));
   if (!memSpace.has_value())
     return op->emitError("could not infer memory space");
@@ -987,7 +987,7 @@ bufferization::detail::unknownGetAliasingValues(OpOperand &opOperand) {
   for (Region &region : opOperand.getOwner()->getRegions())
     if (!region.getBlocks().empty())
       for (BlockArgument bbArg : region.getBlocks().front().getArguments())
-        if (bbArg.getType().isa<TensorType>())
+        if (isa<TensorType>(bbArg.getType()))
           r.addAlias({bbArg, BufferRelation::Unknown, /*isDefinite=*/false});
   return r;
 }
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
index c9fd110d48d9a9..a8ec111f8c304b 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp
@@ -46,7 +46,7 @@ static Value buildBoolValue(OpBuilder &builder, Location loc, bool value) {
   return builder.create<arith::ConstantOp>(loc, builder.getBoolAttr(value));
 }
-static bool isMemref(Value v) { return v.getType().isa<BaseMemRefType>(); }
+static bool isMemref(Value v) { return isa<BaseMemRefType>(v.getType()); }
 /// Return "true" if the given op is guaranteed to have neither "Allocate" nor
 /// "Free" side effects.
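Nearly every hunk in this patch is the same mechanical migration: the deprecated member casts on `Type`, `Attribute`, `Value`, and `Location` become the free functions from `llvm/Support/Casting.h`. A hedged before/after sketch on a builtin type (the helper itself is illustrative, not from the patch):

#include "mlir/IR/BuiltinTypes.h" // mlir::MemRefType
#include "mlir/IR/Value.h"        // mlir::Value
#include "llvm/Support/Casting.h" // llvm::dyn_cast

// Deprecated member style:   v.getType().dyn_cast<MemRefType>()
// Free-function style used throughout this patch:
static bool hasStaticMemrefType(mlir::Value v) {
  // dyn_cast returns a null MemRefType when v is not memref-typed.
  auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(v.getType());
  return memrefTy && memrefTy.hasStaticShape();
}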
diff --git a/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp b/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp index 1c81433bc3e945..fb97045687d653 100644 --- a/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp +++ b/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp @@ -378,7 +378,7 @@ OpFoldResult DivOp::fold(FoldAdaptor adaptor) { if (!rhs) return {}; - ArrayAttr arrayAttr = rhs.dyn_cast(); + ArrayAttr arrayAttr = dyn_cast(rhs); if (!arrayAttr || arrayAttr.size() != 2) return {}; diff --git a/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp b/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp index 0dc357c2298fad..89546da428fa20 100644 --- a/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp @@ -17,7 +17,7 @@ using namespace mlir; using namespace mlir::bufferization; -static bool isMemref(Value v) { return v.getType().isa(); } +static bool isMemref(Value v) { return isa(v.getType()); } namespace { /// While CondBranchOp also implement the BranchOpInterface, we add a diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 7cbf28b627342a..66a71df29a9bb2 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -10,11 +10,15 @@ #include "mlir/Dialect/EmitC/IR/EmitCTraits.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/Types.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" using namespace mlir; using namespace mlir::emitc; @@ -54,6 +58,40 @@ void mlir::emitc::buildTerminatedBody(OpBuilder &builder, Location loc) { builder.create(loc); } +bool mlir::emitc::isSupportedEmitCType(Type type) { + if (llvm::isa(type)) + return true; + if (auto ptrType = llvm::dyn_cast(type)) + return isSupportedEmitCType(ptrType.getPointee()); + if (auto arrayType = llvm::dyn_cast(type)) { + auto elemType = arrayType.getElementType(); + return !llvm::isa(elemType) && + isSupportedEmitCType(elemType); + } + if (type.isIndex()) + return true; + if (llvm::isa(type)) + return isSupportedIntegerType(type); + if (llvm::isa(type)) + return isSupportedFloatType(type); + if (auto tensorType = llvm::dyn_cast(type)) { + if (!tensorType.hasStaticShape()) { + return false; + } + auto elemType = tensorType.getElementType(); + if (llvm::isa(elemType)) { + return false; + } + return isSupportedEmitCType(elemType); + } + if (auto tupleType = llvm::dyn_cast(type)) { + return llvm::all_of(tupleType.getTypes(), [](Type type) { + return !llvm::isa(type) && isSupportedEmitCType(type); + }); + } + return false; +} + bool mlir::emitc::isSupportedIntegerType(Type type) { if (auto intType = llvm::dyn_cast(type)) { switch (intType.getWidth()) { @@ -122,13 +160,13 @@ LogicalResult AddOp::verify() { Type lhsType = getLhs().getType(); Type rhsType = getRhs().getType(); - if (lhsType.isa() && rhsType.isa()) + if (isa(lhsType) && isa(rhsType)) return emitOpError("requires that at most one operand is a pointer"); - if ((lhsType.isa() && - !rhsType.isa()) || - (rhsType.isa() && - !lhsType.isa())) + if ((isa(lhsType) && + !isa(rhsType)) || + (isa(rhsType) && + !isa(lhsType))) return emitOpError("requires that one operand is an integer or of opaque " 
"type if the other is a pointer"); @@ -740,16 +778,16 @@ LogicalResult SubOp::verify() { Type rhsType = getRhs().getType(); Type resultType = getResult().getType(); - if (rhsType.isa() && !lhsType.isa()) + if (isa(rhsType) && !isa(lhsType)) return emitOpError("rhs can only be a pointer if lhs is a pointer"); - if (lhsType.isa() && - !rhsType.isa()) + if (isa(lhsType) && + !isa(rhsType)) return emitOpError("requires that rhs is an integer, pointer or of opaque " "type if lhs is a pointer"); - if (lhsType.isa() && rhsType.isa() && - !resultType.isa()) + if (isa(lhsType) && isa(rhsType) && + !isa(resultType)) return emitOpError("requires that the result is an integer or of opaque " "type if lhs and rhs are pointers"); return success(); diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp index b584f63f16e0aa..3661c5dea45259 100644 --- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -196,7 +196,7 @@ getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) { auto extract = dyn_cast(users); if (!extract) return std::nullopt; - auto vecType = extract.getResult().getType().cast(); + auto vecType = cast(extract.getResult().getType()); if (sliceType && sliceType != vecType) return std::nullopt; sliceType = vecType; @@ -204,7 +204,7 @@ getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) { return llvm::to_vector(sliceType.getShape()); } if ((OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)) { - if (auto vecType = op->getResultTypes()[0].dyn_cast()) { + if (auto vecType = dyn_cast(op->getResultTypes()[0])) { // TODO: The condition for unrolling elementwise should be restricted // only to operations that need unrolling (connected to the contract). 
if (vecType.getRank() < 2) @@ -219,7 +219,7 @@ getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) { auto extract = dyn_cast(users); if (!extract) return std::nullopt; - auto vecType = extract.getResult().getType().cast(); + auto vecType = cast(extract.getResult().getType()); if (sliceType && sliceType != vecType) return std::nullopt; sliceType = vecType; diff --git a/mlir/lib/Dialect/IRDL/IRDLLoading.cpp b/mlir/lib/Dialect/IRDL/IRDLLoading.cpp index 9ab7ae2a90820e..cfc8d092c8178a 100644 --- a/mlir/lib/Dialect/IRDL/IRDLLoading.cpp +++ b/mlir/lib/Dialect/IRDL/IRDLLoading.cpp @@ -354,7 +354,7 @@ static WalkResult loadOperation( // Gather the variadicities of each result for (Attribute attr : resultsOp->getVariadicity()) - resultVariadicity.push_back(attr.cast().getValue()); + resultVariadicity.push_back(cast(attr).getValue()); } // Gather which constraint slots correspond to attributes constraints @@ -367,7 +367,7 @@ static WalkResult loadOperation( for (const auto &[name, value] : llvm::zip(names, values)) { for (auto [i, constr] : enumerate(constrToValue)) { if (constr == value) { - attributesContraints[name.cast()] = i; + attributesContraints[cast(name)] = i; break; } } diff --git a/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp b/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp index f3b674fdb50501..f7f1e944d637d0 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp @@ -42,7 +42,7 @@ static char getRegisterType(Type type) { return 'f'; if (type.isF64()) return 'd'; - if (auto ptr = type.dyn_cast()) { + if (auto ptr = dyn_cast(type)) { // Shared address spaces is addressed with 32-bit pointers. if (ptr.getAddressSpace() == kSharedMemorySpace) { return 'r'; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index f90240a67dcc5f..78ff24dae68b4c 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -47,6 +47,74 @@ using mlir::LLVM::linkage::getMaxEnumValForLinkage; #include "mlir/Dialect/LLVMIR/LLVMOpsDialect.cpp.inc" +//===----------------------------------------------------------------------===// +// Property Helpers +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// IntegerOverflowFlags + +namespace mlir { +static Attribute convertToAttribute(MLIRContext *ctx, + IntegerOverflowFlags flags) { + return IntegerOverflowFlagsAttr::get(ctx, flags); +} + +static LogicalResult +convertFromAttribute(IntegerOverflowFlags &flags, Attribute attr, + function_ref emitError) { + auto flagsAttr = dyn_cast(attr); + if (!flagsAttr) { + return emitError() << "expected 'overflowFlags' attribute to be an " + "IntegerOverflowFlagsAttr, but got " + << attr; + } + flags = flagsAttr.getValue(); + return success(); +} +} // namespace mlir + +static ParseResult parseOverflowFlags(AsmParser &p, + IntegerOverflowFlags &flags) { + if (failed(p.parseOptionalKeyword("overflow"))) { + flags = IntegerOverflowFlags::none; + return success(); + } + if (p.parseLess()) + return failure(); + do { + StringRef kw; + SMLoc loc = p.getCurrentLocation(); + if (p.parseKeyword(&kw)) + return failure(); + std::optional flag = + symbolizeIntegerOverflowFlags(kw); + if (!flag) + return p.emitError(loc, + "invalid overflow flag: expected nsw, nuw, or none"); + flags = flags | *flag; + } while 
(succeeded(p.parseOptionalComma())); + return p.parseGreater(); +} + +static void printOverflowFlags(AsmPrinter &p, Operation *op, + IntegerOverflowFlags flags) { + if (flags == IntegerOverflowFlags::none) + return; + p << " overflow<"; + SmallVector strs; + if (bitEnumContainsAny(flags, IntegerOverflowFlags::nsw)) + strs.push_back("nsw"); + if (bitEnumContainsAny(flags, IntegerOverflowFlags::nuw)) + strs.push_back("nuw"); + llvm::interleaveComma(strs, p); + p << ">"; +} + +//===----------------------------------------------------------------------===// +// Attribute Helpers +//===----------------------------------------------------------------------===// + static constexpr const char kElemTypeAttrName[] = "elem_type"; static auto processFMFAttr(ArrayRef attrs) { @@ -70,12 +138,12 @@ static ParseResult parseLLVMOpAttrs(OpAsmParser &parser, static void printLLVMOpAttrs(OpAsmPrinter &printer, Operation *op, DictionaryAttr attrs) { auto filteredAttrs = processFMFAttr(attrs.getValue()); - if (auto iface = dyn_cast(op)) + if (auto iface = dyn_cast(op)) { printer.printOptionalAttrDict( - filteredAttrs, - /*elidedAttrs=*/{iface.getIntegerOverflowAttrName()}); - else + filteredAttrs, /*elidedAttrs=*/{iface.getOverflowFlagsAttrName()}); + } else { printer.printOptionalAttrDict(filteredAttrs); + } } /// Verifies `symbol`'s use in `op` to ensure the symbol is a valid and @@ -559,7 +627,7 @@ static void destructureIndices(Type currType, ArrayRef indices, // we don't do anything here. The verifier will catch it and emit a proper // error. All other canonicalization is done in the fold method. bool requiresConst = !rawConstantIndices.empty() && - currType.isa_and_nonnull(); + isa_and_nonnull(currType); if (Value val = llvm::dyn_cast_if_present(iter)) { APInt intC; if (requiresConst && matchPattern(val, m_ConstantInt(&intC)) && @@ -2564,14 +2632,14 @@ LogicalResult LLVM::ConstantOp::verify() { } // See the comment for getLLVMConstant for more details about why 8-bit // floats can be represented by integers. 
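// Hedged recap of the check below (sketch, not part of the patch): small
// float constants (e.g. the 8-bit FP8 variants) have no LLVM constant
// representation of their own, so the verifier accepts an integer type of
// the same bit width instead, roughly:
//   if (isa<FloatType>(getType()) && !getType().isInteger(floatWidth))
//     return emitOpError() << "expected integer type of width " << floatWidth;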
- if (getType().isa() && !getType().isInteger(floatWidth)) { + if (isa(getType()) && !getType().isInteger(floatWidth)) { return emitOpError() << "expected integer type of width " << floatWidth; } } if (auto splatAttr = dyn_cast(getValue())) { - if (!getType().isa() && !getType().isa() && - !getType().isa() && - !getType().isa()) + if (!isa(getType()) && !isa(getType()) && + !isa(getType()) && + !isa(getType())) return emitOpError() << "expected vector or array type"; } return success(); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp index 93901477b58204..f2ab3eae2c343e 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp @@ -319,7 +319,7 @@ LogicalResult LLVM::StoreOp::ensureOnlySafeAccesses( static Type getTypeAtIndex(const DestructurableMemorySlot &slot, Attribute index) { auto subelementIndexMap = - slot.elemType.cast().getSubelementIndexMap(); + cast(slot.elemType).getSubelementIndexMap(); if (!subelementIndexMap) return {}; assert(!subelementIndexMap->empty()); @@ -913,8 +913,7 @@ bool LLVM::MemsetOp::canRewire(const DestructurableMemorySlot &slot, if (getIsVolatile()) return false; - if (!slot.elemType.cast() - .getSubelementIndexMap()) + if (!cast(slot.elemType).getSubelementIndexMap()) return false; if (!areAllIndicesI32(slot)) @@ -928,7 +927,7 @@ DeletionKind LLVM::MemsetOp::rewire(const DestructurableMemorySlot &slot, RewriterBase &rewriter, const DataLayout &dataLayout) { std::optional> types = - slot.elemType.cast().getSubelementIndexMap(); + cast(slot.elemType).getSubelementIndexMap(); IntegerAttr memsetLenAttr; bool successfulMatch = @@ -1047,8 +1046,7 @@ static bool memcpyCanRewire(MemcpyLike op, const DestructurableMemorySlot &slot, if (op.getIsVolatile()) return false; - if (!slot.elemType.cast() - .getSubelementIndexMap()) + if (!cast(slot.elemType).getSubelementIndexMap()) return false; if (!areAllIndicesI32(slot)) diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp index b264e9ff9283d4..0a372ad0c52fcd 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp @@ -475,7 +475,7 @@ LogicalResult SplitStores::matchAndRewrite(StoreOp store, } } - auto destructurableType = typeHint.dyn_cast(); + auto destructurableType = dyn_cast(typeHint); if (!destructurableType) return failure(); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp index 3e85559e1ec0c6..768df0953fc5c8 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp @@ -202,9 +202,9 @@ DiagnosedSilenceableFailure transform::MatchStructuredBodyOp::matchOperation( body, [&](Operation *elem, Operation *red) { return elem->getName().getStringRef() == - (*contractionOps)[0].cast().getValue() && + cast((*contractionOps)[0]).getValue() && red->getName().getStringRef() == - (*contractionOps)[1].cast().getValue(); + cast((*contractionOps)[1]).getValue(); }, os); if (result) @@ -259,11 +259,11 @@ transform::MatchStructuredClassifyContractionDimsOp::matchOperation( return builder.getI64IntegerAttr(value); })); }; - results.setParams(getBatch().cast(), + results.setParams(cast(getBatch()), makeI64Attrs(contractionDims->batch)); - results.setParams(getM().cast(), makeI64Attrs(contractionDims->m)); - 
results.setParams(getN().cast(), makeI64Attrs(contractionDims->n)); - results.setParams(getK().cast(), makeI64Attrs(contractionDims->k)); + results.setParams(cast(getM()), makeI64Attrs(contractionDims->m)); + results.setParams(cast(getN()), makeI64Attrs(contractionDims->n)); + results.setParams(cast(getK()), makeI64Attrs(contractionDims->k)); return DiagnosedSilenceableFailure::success(); } @@ -288,17 +288,17 @@ transform::MatchStructuredClassifyConvolutionDimsOp::matchOperation( return builder.getI64IntegerAttr(value); })); }; - results.setParams(getBatch().cast(), + results.setParams(cast(getBatch()), makeI64Attrs(convolutionDims->batch)); - results.setParams(getOutputImage().cast(), + results.setParams(cast(getOutputImage()), makeI64Attrs(convolutionDims->outputImage)); - results.setParams(getOutputChannel().cast(), + results.setParams(cast(getOutputChannel()), makeI64Attrs(convolutionDims->outputChannel)); - results.setParams(getFilterLoop().cast(), + results.setParams(cast(getFilterLoop()), makeI64Attrs(convolutionDims->filterLoop)); - results.setParams(getInputChannel().cast(), + results.setParams(cast(getInputChannel()), makeI64Attrs(convolutionDims->inputChannel)); - results.setParams(getDepth().cast(), + results.setParams(cast(getDepth()), makeI64Attrs(convolutionDims->depth)); auto makeI64AttrsFromI64 = [&](ArrayRef values) { @@ -307,9 +307,9 @@ transform::MatchStructuredClassifyConvolutionDimsOp::matchOperation( return builder.getI64IntegerAttr(value); })); }; - results.setParams(getStrides().cast(), + results.setParams(cast(getStrides()), makeI64AttrsFromI64(convolutionDims->strides)); - results.setParams(getDilations().cast(), + results.setParams(cast(getDilations()), makeI64AttrsFromI64(convolutionDims->dilations)); return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 7e7cf1d0244613..3c3d968fbb865e 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1219,7 +1219,7 @@ transform::MatchOp::apply(transform::TransformRewriter &rewriter, // All the operands must must be equal to the specified type auto typeattr = dyn_cast(getFilterOperandTypes().value()[0]); - Type t = typeattr.getValue().cast<::mlir::Type>(); + Type t = cast<::mlir::Type>(typeattr.getValue()); if (!llvm::all_of(op->getOperandTypes(), [&](Type operandType) { return operandType == t; })) return; @@ -1234,7 +1234,7 @@ transform::MatchOp::apply(transform::TransformRewriter &rewriter, for (auto [attr, operandType] : llvm::zip_equal(getFilterOperandTypes().value(), operandTypes)) { auto typeattr = cast(attr); - Type type = typeattr.getValue().cast<::mlir::Type>(); + Type type = cast<::mlir::Type>(typeattr.getValue()); if (type != operandType) return; @@ -2665,7 +2665,7 @@ transform::TileUsingForOp::apply(transform::TransformRewriter &rewriter, if (auto attr = llvm::dyn_cast_if_present(ofr)) { if (scalableSizes[ofrIdx]) { auto val = b.create( - getLoc(), attr.cast().getInt()); + getLoc(), cast(attr).getInt()); Value vscale = b.create(getLoc(), b.getIndexType()); sizes.push_back( diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp index b95677b7457e63..59c189fa1fbadc 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp +++ 
b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp @@ -60,7 +60,7 @@ static void createMemcpy(OpBuilder &b, Location loc, Value tensorSource, const linalg::BufferizeToAllocationOptions &options) { auto tensorType = dyn_cast(tensorSource.getType()); assert(tensorType && "expected ranked tensor"); - assert(memrefDest.getType().isa() && "expected ranked memref"); + assert(isa(memrefDest.getType()) && "expected ranked memref"); switch (options.memcpyOp) { case linalg::BufferizeToAllocationOptions::MemcpyOp:: @@ -496,10 +496,10 @@ Value linalg::bufferizeToAllocation( if (op == nestedOp) return; if (llvm::any_of(nestedOp->getOperands(), - [](Value v) { return v.getType().isa(); })) + [](Value v) { return isa(v.getType()); })) llvm_unreachable("ops with nested tensor ops are not supported yet"); if (llvm::any_of(nestedOp->getResults(), - [](Value v) { return v.getType().isa(); })) + [](Value v) { return isa(v.getType()); })) llvm_unreachable("ops with nested tensor ops are not supported yet"); }); } @@ -508,7 +508,7 @@ Value linalg::bufferizeToAllocation( // Gather tensor results. SmallVector tensorResults; for (OpResult result : op->getResults()) { - if (!result.getType().isa()) + if (!isa(result.getType())) continue; // Unranked tensors are not supported if (!isa(result.getType())) diff --git a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp index 81669a1807796c..4776883ed95c5c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/EliminateEmptyTensors.cpp @@ -49,7 +49,7 @@ LogicalResult linalg::linalgOpAnchoredEmptyTensorEliminationStep( for (OpOperand *in : op.getDpsInputOperands()) { // Skip non-tensor operands. - if (!in->get().getType().isa()) + if (!isa(in->get().getType())) continue; // Find tensor.empty ops on the reverse SSA use-def chain. Only follow diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp index 462f692615faad..df4089d61bfd72 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -405,7 +405,7 @@ static FailureOr tileToForallOpImpl( for (OpOperand &outOperand : destinationStyleOp.getDpsInitsMutable()) { // Swap tensor inits with the corresponding block argument of the // scf.forall op. Memref inits remain as is. 
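// Hedged note on the check below (sketch, not part of the patch): the
// dispatch hinges on the init operand's type, roughly
//   if (isa<TensorType>(outOperand.get().getType())) { ...swap... }
// Only tensor inits get swapped for the scf.forall shared-output block
// argument; memref inits have no such argument and stay as they are.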
- if (outOperand.get().getType().isa()) { + if (isa(outOperand.get().getType())) { auto *it = llvm::find(dest, outOperand.get()); assert(it != dest.end() && "could not find destination tensor"); unsigned destNum = std::distance(dest.begin(), it); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index a17bc8e4cd318f..2297bf5e355125 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -557,7 +557,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, Value dest = tensor::PackOp::createDestinationTensor( rewriter, loc, operand, innerPackSizes, innerPos, /*outerDimsPerm=*/{}); - ShapedType operandType = operand.getType().cast(); + ShapedType operandType = cast(operand.getType()); bool areConstantTiles = llvm::all_of(innerPackSizes, [](OpFoldResult tile) { return getConstantIntValue(tile).has_value(); @@ -565,7 +565,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, if (areConstantTiles && operandType.hasStaticShape() && !tensor::PackOp::requirePaddingValue( operandType.getShape(), innerPos, - dest.getType().cast().getShape(), {}, + cast(dest.getType()).getShape(), {}, innerPackSizes)) { packOps.push_back(rewriter.create( loc, operand, dest, innerPos, innerPackSizes)); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index df61381432921b..fbff2088637f44 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -3410,8 +3410,8 @@ struct Conv1DGenerator // * shape_cast(broadcast(filter)) // * broadcast(shuffle(filter)) // Opt for the option without shape_cast to simplify the codegen. - auto rhsSize = rhs.getType().cast().getShape()[0]; - auto resSize = res.getType().cast().getShape()[1]; + auto rhsSize = cast(rhs.getType()).getShape()[0]; + auto resSize = cast(res.getType()).getShape()[1]; SmallVector indicies; for (int i = 0; i < resSize / rhsSize; ++i) { diff --git a/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp b/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp index b3481ce1c56bbd..3c9475c2d143a6 100644 --- a/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp +++ b/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp @@ -173,8 +173,8 @@ transform::MemRefAllocaToGlobalOp::apply(transform::TransformRewriter &rewriter, } // Assemble results. 
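// Hedged note on the two calls below (sketch, not part of the patch):
// transform op accessors such as getGlobal() return a generic Value, while
// TransformResults::set is keyed on the op's own results, hence
//   results.set(cast<OpResult>(getGlobal()), globalOps);
// The cast is checked but always succeeds here, because a transform op's
// result handles are by construction OpResults of that op.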
-  results.set(getGlobal().cast<OpResult>(), globalOps);
-  results.set(getGetGlobal().cast<OpResult>(), getGlobalOps);
+  results.set(cast<OpResult>(getGlobal()), globalOps);
+  results.set(cast<OpResult>(getGetGlobal()), getGlobalOps);
   return DiagnosedSilenceableFailure::success();
 }
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 8236a4c475f17c..4449733f0daf06 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -254,7 +254,7 @@ struct ConvertMemRefLoad final : OpConversionPattern<memref::LoadOp> {
   LogicalResult
   matchAndRewrite(memref::LoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto convertedType = adaptor.getMemref().getType().cast<MemRefType>();
+    auto convertedType = cast<MemRefType>(adaptor.getMemref().getType());
     auto convertedElementType = convertedType.getElementType();
     auto oldElementType = op.getMemRefType().getElementType();
     int srcBits = oldElementType.getIntOrFloatBitWidth();
@@ -351,7 +351,7 @@ struct ConvertMemrefStore final : OpConversionPattern<memref::StoreOp> {
   LogicalResult
   matchAndRewrite(memref::StoreOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto convertedType = adaptor.getMemref().getType().cast<MemRefType>();
+    auto convertedType = cast<MemRefType>(adaptor.getMemref().getType());
     int srcBits = op.getMemRefType().getElementTypeBitWidth();
     int dstBits = convertedType.getElementTypeBitWidth();
     auto dstIntegerType = rewriter.getIntegerType(dstBits);
diff --git a/mlir/lib/Dialect/MemRef/Transforms/ExpandRealloc.cpp b/mlir/lib/Dialect/MemRef/Transforms/ExpandRealloc.cpp
index 62a8f7e43c8675..dcc5eac916d034 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ExpandRealloc.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ExpandRealloc.cpp
@@ -68,7 +68,7 @@ struct ExpandReallocOpPattern : public OpRewritePattern<memref::ReallocOp> {
     // Get the size of the original buffer.
     int64_t inputSize =
-        op.getSource().getType().cast<MemRefType>().getDimSize(0);
+        cast<MemRefType>(op.getSource().getType()).getDimSize(0);
     OpFoldResult currSize = rewriter.getIndexAttr(inputSize);
     if (ShapedType::isDynamic(inputSize)) {
       Value dimZero = getValueOrCreateConstantIndexOp(rewriter, loc,
@@ -79,7 +79,7 @@ struct ExpandReallocOpPattern : public OpRewritePattern<memref::ReallocOp> {
     // Get the requested size that the new buffer should have.
     int64_t outputSize =
-        op.getResult().getType().cast<MemRefType>().getDimSize(0);
+        cast<MemRefType>(op.getResult().getType()).getDimSize(0);
     OpFoldResult targetSize = ShapedType::isDynamic(outputSize)
                                   ? OpFoldResult{op.getDynamicResultSize()}
                                   : rewriter.getIndexAttr(outputSize);
@@ -127,7 +127,7 @@ struct ExpandReallocOpPattern : public OpRewritePattern<memref::ReallocOp> {
       // is already bigger than the requested size, the cast represents a
       // subview operation.
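// Hedged note on the cast below (sketch, not part of the patch): when the
// original allocation is already large enough, no new buffer is made; the
// source is simply re-viewed with offset 0, size %targetSize, stride 1,
// which is why the comment above calls the cast a subview. Only the
// "buffer too small" branch earlier allocates and copies.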
Value casted = builder.create( - loc, op.getResult().getType().cast(), op.getSource(), + loc, cast(op.getResult().getType()), op.getSource(), rewriter.getIndexAttr(0), ArrayRef{targetSize}, ArrayRef{rewriter.getIndexAttr(1)}); builder.create(loc, casted); diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index 03f11ad1f94965..d4329b401df191 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -169,7 +169,7 @@ ShapedType mesh::shardShapedType(ShapedType shape, MeshOp mesh, } Type mesh::shardType(Type type, MeshOp mesh, MeshShardingAttr sharding) { - RankedTensorType rankedTensorType = type.dyn_cast(); + RankedTensorType rankedTensorType = dyn_cast(type); if (rankedTensorType) { return shardShapedType(rankedTensorType, mesh, sharding); } @@ -281,7 +281,8 @@ MeshShardingAttr::verify(function_ref emitError, } bool MeshShardingAttr::operator==(Attribute rhs) const { - MeshShardingAttr rhsAsMeshShardingAttr = rhs.dyn_cast(); + MeshShardingAttr rhsAsMeshShardingAttr = + mlir::dyn_cast(rhs); return rhsAsMeshShardingAttr && *this == rhsAsMeshShardingAttr; } @@ -484,15 +485,15 @@ static LogicalResult verifyDimensionCompatibility(Location loc, static LogicalResult verifyGatherOperandAndResultShape( Value operand, Value result, int64_t gatherAxis, ArrayRef meshAxes, ArrayRef meshShape) { - auto resultRank = result.getType().template cast().getRank(); + auto resultRank = cast(result.getType()).getRank(); if (gatherAxis < 0 || gatherAxis >= resultRank) { return emitError(result.getLoc()) << "Gather axis " << gatherAxis << " is out of bounds [0, " << resultRank << ")."; } - ShapedType operandType = operand.getType().cast(); - ShapedType resultType = result.getType().cast(); + ShapedType operandType = cast(operand.getType()); + ShapedType resultType = cast(result.getType()); auto deviceGroupSize = DimensionSize(collectiveProcessGroupSize(meshAxes, meshShape)); for (int64_t axis = 0; axis < operandType.getRank(); ++axis) { @@ -511,8 +512,8 @@ static LogicalResult verifyGatherOperandAndResultShape( static LogicalResult verifyAllToAllOperandAndResultShape( Value operand, Value result, int64_t splitAxis, int64_t concatAxis, ArrayRef meshAxes, ArrayRef meshShape) { - ShapedType operandType = operand.getType().cast(); - ShapedType resultType = result.getType().cast(); + ShapedType operandType = cast(operand.getType()); + ShapedType resultType = cast(result.getType()); for (int64_t axis = 0; axis < operandType.getRank(); ++axis) { if ((axis != splitAxis && axis != concatAxis) || splitAxis == concatAxis) { if (failed(verifyDimensionCompatibility( @@ -556,8 +557,8 @@ static LogicalResult verifyAllToAllOperandAndResultShape( static LogicalResult verifyScatterOrSliceOperandAndResultShape( Value operand, Value result, int64_t tensorAxis, ArrayRef meshAxes, ArrayRef meshShape) { - ShapedType operandType = operand.getType().cast(); - ShapedType resultType = result.getType().cast(); + ShapedType operandType = cast(operand.getType()); + ShapedType resultType = cast(result.getType()); for (int64_t axis = 0; axis < operandType.getRank(); ++axis) { if (axis != tensorAxis) { if (failed(verifyDimensionCompatibility( diff --git a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp index 9acee5aa8d8604..dbb9e667d4709c 100644 --- a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp +++ b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp @@ -97,7 +97,7 @@ 
checkOperandAffineExpr(AffineExpr expr, unsigned numDims) { FailureOr> mesh::getMeshShardingAttr(OpResult result) { - Value val = result.cast(); + Value val = cast(result); bool anyShardedForDef = llvm::any_of(val.getUsers(), [](Operation *user) { auto shardOp = llvm::dyn_cast(user); if (!shardOp) @@ -178,7 +178,7 @@ LogicalResult mesh::ShardingInterface::verifyShardingInterfaceImpl() { return failure(); for (OpResult result : op->getResults()) { - auto resultType = result.getType().dyn_cast(); + auto resultType = dyn_cast(result.getType()); if (!resultType) return failure(); AffineMap map = maps[numOperands + result.getResultNumber()]; @@ -404,7 +404,7 @@ static LogicalResult addShardOp(OpBuilder &b, OpResult result, if (succeeded(maybeSharding) && !maybeSharding->first) return success(); - auto resultType = result.getType().cast(); + auto resultType = cast(result.getType()); SmallVector> splitAxes(resultType.getRank()); SmallVector partialAxes; @@ -457,7 +457,7 @@ static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand, if (succeeded(maybeShardingAttr) && maybeShardingAttr->first) return success(); Value operand = opOperand.get(); - auto operandType = operand.getType().cast(); + auto operandType = cast(operand.getType()); SmallVector> splitAxes(operandType.getRank()); unsigned numDims = map.getNumDims(); for (auto it : llvm::enumerate(map.getResults())) { @@ -526,7 +526,7 @@ LogicalResult mesh::detail::defaultAddShardingAnnotations( static bool isValueCompatibleWithFullReplicationSharding(Value value, MeshShardingAttr sharding) { - if (value.getType().isa()) { + if (isa(value.getType())) { return sharding && isFullReplication(sharding); } diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index e4868435135ed1..6b1326d76bc4a4 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -86,14 +86,13 @@ handlePartialAxesDuringResharding(OpBuilder &builder, } builder.setInsertionPointAfterValue(sourceShard); - TypedValue resultValue = + TypedValue resultValue = cast>( builder .create(sourceShard.getLoc(), sourceShard.getType(), sourceSharding.getMesh().getLeafReference(), allReduceMeshAxes, sourceShard, sourceSharding.getPartialType()) - .getResult() - .cast>(); + .getResult()); llvm::SmallVector remainingPartialAxes; llvm::copy_if(sourceShardingPartialAxesSet, @@ -135,13 +134,12 @@ splitLastAxisInResharding(ImplicitLocOpBuilder &builder, MeshShardingAttr sourceSharding, TypedValue sourceShard, MeshOp mesh, int64_t splitTensorAxis, MeshAxis splitMeshAxis) { - TypedValue targetShard = + TypedValue targetShard = cast>( builder .create(sourceShard, mesh, ArrayRef(splitMeshAxis), splitTensorAxis) - .getResult() - .cast>(); + .getResult()); MeshShardingAttr targetSharding = targetShardingInSplitLastAxis( builder.getContext(), sourceSharding, splitTensorAxis, splitMeshAxis); return {targetShard, targetSharding}; @@ -278,10 +276,8 @@ unsplitLastAxisInResharding(ImplicitLocOpBuilder &builder, APInt(64, splitTensorAxis)); ShapedType targetShape = shardShapedType(sourceUnshardedShape, mesh, targetSharding); - TypedValue targetShard = - builder.create(targetShape, allGatherResult) - .getResult() - .cast>(); + TypedValue targetShard = cast>( + builder.create(targetShape, allGatherResult).getResult()); return {targetShard, targetSharding}; } @@ -413,10 +409,8 @@ moveLastSplitAxisInResharding(ImplicitLocOpBuilder &builder, MeshOp mesh, APInt(64, targetTensorAxis), APInt(64, 
sourceTensorAxis)); ShapedType targetShape = shardShapedType(sourceUnshardedShape, mesh, targetSharding); - TypedValue targetShard = - builder.create(targetShape, allToAllResult) - .getResult() - .cast>(); + TypedValue targetShard = cast>( + builder.create(targetShape, allToAllResult).getResult()); return {targetShard, targetSharding}; } @@ -505,7 +499,7 @@ TypedValue reshard(OpBuilder &builder, MeshOp mesh, ShardOp source, ImplicitLocOpBuilder implicitLocOpBuilder(target->getLoc(), builder); return reshard( implicitLocOpBuilder, mesh, source.getShard(), target.getShard(), - source.getSrc().cast>(), sourceShardValue); + cast>(source.getSrc()), sourceShardValue); } TypedValue reshard(OpBuilder &builder, ShardOp source, @@ -533,23 +527,22 @@ SmallVector shardedBlockArgumentTypes(Block &block, SymbolTableCollection &symbolTableCollection) { SmallVector res; - llvm::transform(block.getArguments(), std::back_inserter(res), - [&symbolTableCollection](BlockArgument arg) { - auto rankedTensorArg = - arg.dyn_cast>(); - if (!rankedTensorArg) { - return arg.getType(); - } - - assert(rankedTensorArg.hasOneUse()); - Operation *useOp = *rankedTensorArg.getUsers().begin(); - ShardOp shardOp = llvm::dyn_cast(useOp); - assert(shardOp); - MeshOp mesh = getMesh(shardOp, symbolTableCollection); - return shardShapedType(rankedTensorArg.getType(), mesh, - shardOp.getShardAttr()) - .cast(); - }); + llvm::transform( + block.getArguments(), std::back_inserter(res), + [&symbolTableCollection](BlockArgument arg) { + auto rankedTensorArg = dyn_cast>(arg); + if (!rankedTensorArg) { + return arg.getType(); + } + + assert(rankedTensorArg.hasOneUse()); + Operation *useOp = *rankedTensorArg.getUsers().begin(); + ShardOp shardOp = llvm::dyn_cast(useOp); + assert(shardOp); + MeshOp mesh = getMesh(shardOp, symbolTableCollection); + return cast(shardShapedType(rankedTensorArg.getType(), mesh, + shardOp.getShardAttr())); + }); return res; } @@ -587,7 +580,7 @@ static SmallVector getOperandShardings(Operation &op) { res.reserve(op.getNumOperands()); llvm::transform(op.getOperands(), std::back_inserter(res), [](Value operand) { TypedValue rankedTensor = - operand.dyn_cast>(); + dyn_cast>(operand); if (!rankedTensor) { return MeshShardingAttr(); } @@ -608,7 +601,7 @@ static SmallVector getResultShardings(Operation &op) { llvm::transform(op.getResults(), std::back_inserter(res), [](OpResult result) { TypedValue rankedTensor = - result.dyn_cast>(); + dyn_cast>(result); if (!rankedTensor) { return MeshShardingAttr(); } @@ -636,9 +629,8 @@ spmdizeOperation(ShardOp shardOp, IRMapping &spmdizationMap, } else { // Insert resharding. 
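// Hedged note on the cast below (sketch, not part of the patch): the
// spmdization map stores plain Values, so re-entering the typed resharding
// API uses the checked free-function form
//   auto shard = cast<TypedValue<ShapedType>>(spmdizationMap.lookup(v));
// A payload that is not shaped would assert here instead of being
// resharded incorrectly.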
assert(!srcShardOp.getAnnotateForUsers() && shardOp.getAnnotateForUsers()); - TypedValue srcSpmdValue = - spmdizationMap.lookup(srcShardOp.getOperand()) - .cast>(); + TypedValue srcSpmdValue = cast>( + spmdizationMap.lookup(srcShardOp.getOperand())); targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue, symbolTableCollection); } diff --git a/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp b/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp index cb13ee404751ca..60c4e07a118cb8 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp @@ -133,7 +133,7 @@ struct AllSliceOpLowering // insert tensor.extract_slice RankedTensorType operandType = - op.getOperand().getType().cast(); + cast(op.getOperand().getType()); SmallVector sizes; for (int64_t i = 0; i < operandType.getRank(); ++i) { if (i == sliceAxis) { @@ -202,10 +202,9 @@ createCollectiveProcessGroupSize(MeshOp mesh, ArrayRef axes, ImplicitLocOpBuilder &builder) { Operation::result_range meshShape = builder.create(mesh, axes).getResults(); - return arith::createProduct(builder, builder.getLoc(), - llvm::to_vector_of(meshShape), - builder.getIndexType()) - .cast>(); + return cast>(arith::createProduct( + builder, builder.getLoc(), llvm::to_vector_of(meshShape), + builder.getIndexType())); } TypedValue createProcessLinearIndex(StringRef mesh, diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 1635297a5447d4..4e256aea0be37a 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -651,7 +651,7 @@ struct MmaSyncBuilder { template static void foreachIndividualVectorElement(Value vector, ApplyFn applyFn, ReduceFn reduceFn) { - VectorType vectorType = vector.getType().cast(); + VectorType vectorType = cast(vector.getType()); auto vectorShape = vectorType.getShape(); auto strides = computeStrides(vectorShape); for (int64_t idx = 0, e = vectorShape[0] * strides[0]; idx < e; ++idx) { @@ -779,11 +779,11 @@ FailureOr MmaSyncBuilder::buildMmaSync(LinalgOp linalgOp) { Value lhsMemRef = linalgOp.getDpsInputOperand(0)->get(); Value rhsMemRef = linalgOp.getDpsInputOperand(1)->get(); Value resMemRef = linalgOp.getDpsInitOperand(0)->get(); - assert(lhsMemRef.getType().cast().getRank() == 2 && + assert(cast(lhsMemRef.getType()).getRank() == 2 && "expected lhs to be a 2D memref"); - assert(rhsMemRef.getType().cast().getRank() == 2 && + assert(cast(rhsMemRef.getType()).getRank() == 2 && "expected rhs to be a 2D memref"); - assert(resMemRef.getType().cast().getRank() == 2 && + assert(cast(resMemRef.getType()).getRank() == 2 && "expected res to be a 2D memref"); int64_t m = cast(lhsMemRef.getType()).getShape()[0]; diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 1e480d6471cbce..528a0d05b1011b 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1318,7 +1318,7 @@ static LogicalResult verifyPrivateVarList(OpType &op) { for (auto privateVarInfo : llvm::zip_equal(privateVars, privatizers)) { Type varType = std::get<0>(privateVarInfo).getType(); SymbolRefAttr privatizerSym = - std::get<1>(privateVarInfo).template cast(); + cast(std::get<1>(privateVarInfo)); PrivateClauseOp privatizerOp = SymbolTable::lookupNearestSymbolFrom(op, privatizerSym); @@ -1344,6 +1344,22 @@ static LogicalResult verifyPrivateVarList(OpType 
&op) { } LogicalResult ParallelOp::verify() { + // Check that it is a valid loop wrapper if it's taking that role. + if (isa((*this)->getParentOp())) { + if (!isWrapper()) + return emitOpError() << "must take a loop wrapper role if nested inside " + "of 'omp.distribute'"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after PARALLEL. + if (!isa(nested)) + return emitError() << "only supported nested wrapper is 'omp.wsloop'"; + } else { + return emitOpError() << "must not wrap an 'omp.loop_nest' directly"; + } + } + if (getAllocateVars().size() != getAllocatorsVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp index 481275f052a3ce..187b3b71e3458b 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/Var.cpp @@ -145,9 +145,9 @@ void VarInfo::setNum(Var::Num n) { /// mismatches. LLVM_ATTRIBUTE_UNUSED static llvm::SMLoc minSMLoc(AsmParser &parser, llvm::SMLoc sm1, llvm::SMLoc sm2) { - const auto loc1 = parser.getEncodedSourceLoc(sm1).dyn_cast(); + const auto loc1 = dyn_cast(parser.getEncodedSourceLoc(sm1)); assert(loc1 && "Could not get `FileLineColLoc` for first `SMLoc`"); - const auto loc2 = parser.getEncodedSourceLoc(sm2).dyn_cast(); + const auto loc2 = dyn_cast(parser.getEncodedSourceLoc(sm2)); assert(loc2 && "Could not get `FileLineColLoc` for second `SMLoc`"); if (loc1.getFilename() != loc2.getFilename()) return SMLoc(); diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 516b0943bdcfac..b1d44559fa5aba 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -2078,7 +2078,7 @@ struct SparseTensorAsmDialectInterface : public OpAsmDialectInterface { using OpAsmDialectInterface::OpAsmDialectInterface; AliasResult getAlias(Attribute attr, raw_ostream &os) const override { - if (attr.isa()) { + if (isa(attr)) { os << "sparse"; return AliasResult::OverridableAlias; } diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorInterfaces.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorInterfaces.cpp index 4f9988d48d7710..9c84f4c25866fd 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorInterfaces.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorInterfaces.cpp @@ -29,7 +29,7 @@ LogicalResult sparse_tensor::detail::stageWithSortImpl( Location loc = op.getLoc(); Type finalTp = op->getOpResult(0).getType(); - SparseTensorType dstStt(finalTp.cast()); + SparseTensorType dstStt(cast(finalTp)); Type srcCOOTp = dstStt.getCOOType(/*ordered=*/false); // Clones the original operation but changing the output to an unordered COO. 
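The `ParallelOp::verify` additions in the OpenMP hunk above encode the composite-construct rules: an `omp.parallel` nested directly inside `omp.distribute` must take the loop-wrapper role, the only wrapper it may itself wrap is `omp.wsloop`, and wrapping an `omp.loop_nest` directly is rejected. A hedged sketch of the one accepted nesting (illustrative IR shown in comments, not taken from the patch):

// omp.distribute {
//   omp.parallel {          // wrapper role, enforced via isWrapper()
//     omp.wsloop {          // the only legal nested wrapper
//       omp.loop_nest ...   // the actual loop
//     }
//   }
// }
// An omp.parallel that wraps omp.loop_nest directly under omp.distribute
// now fails with "must not wrap an 'omp.loop_nest' directly".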
diff --git a/mlir/lib/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.cpp b/mlir/lib/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.cpp index 5b7ea9360e2211..ca19259ebffa68 100644 --- a/mlir/lib/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.cpp +++ b/mlir/lib/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.cpp @@ -25,7 +25,7 @@ DiagnosedSilenceableFailure transform::MatchSparseInOut::matchOperation( return emitSilenceableFailure(current->getLoc(), "operation has no sparse input or output"); } - results.set(getResult().cast(), state.getPayloadOps(getTarget())); + results.set(cast(getResult()), state.getPayloadOps(getTarget())); return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp index eafbe95b7aebe0..a53bce16dad860 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseAssembler.cpp @@ -42,7 +42,7 @@ static void convTypes(TypeRange types, SmallVectorImpl &convTypes, if (kind == SparseTensorFieldKind::PosMemRef || kind == SparseTensorFieldKind::CrdMemRef || kind == SparseTensorFieldKind::ValMemRef) { - auto rtp = t.cast(); + auto rtp = cast(t); if (!directOut) { rtp = RankedTensorType::get(rtp.getShape(), rtp.getElementType()); if (extraTypes) @@ -97,7 +97,7 @@ static void convVals(OpBuilder &builder, Location loc, TypeRange types, mem = builder.create(loc, inputs[0]); toVals.push_back(mem); } else { - ShapedType rtp = t.cast(); + ShapedType rtp = cast(t); rtp = RankedTensorType::get(rtp.getShape(), rtp.getElementType()); inputs.push_back(extraVals[extra++]); retTypes.push_back(rtp); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp index 9c0fc60877d8a3..36ecf692b02c51 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp @@ -502,7 +502,7 @@ struct GenericOpScheduler : public OpRewritePattern { for (const AffineExpr l : order.getResults()) { unsigned loopId = llvm::cast(l).getPosition(); auto itTp = - linalgOp.getIteratorTypes()[loopId].cast(); + cast(linalgOp.getIteratorTypes()[loopId]); if (linalg::isReductionIterator(itTp.getValue())) break; // terminate at first reduction nest++; diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index b117c1694e45b8..02375f54d7152f 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -476,8 +476,8 @@ struct GenSemiRingSelect : public OpRewritePattern { if (!sel) return std::nullopt; - auto tVal = sel.getTrueValue().dyn_cast(); - auto fVal = sel.getFalseValue().dyn_cast(); + auto tVal = dyn_cast(sel.getTrueValue()); + auto fVal = dyn_cast(sel.getFalseValue()); // TODO: For simplicity, we only handle cases where both true/false value // are directly loaded the input tensor. We can probably admit more cases // in theory. @@ -487,7 +487,7 @@ struct GenSemiRingSelect : public OpRewritePattern { // Helper lambda to determine whether the value is loaded from a dense input // or is a loop invariant. 
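// Hedged note on the guard above and the lambda below (sketch, not part of
// the patch): a select operand participates in the semi-ring rewrite only
// if it is a block argument of the linalg body, i.e.
//   if (auto bArg = dyn_cast<BlockArgument>(v)) { ... }
// dyn_cast yields a null BlockArgument for op results, so computed values
// fall through to the loop-invariance check instead.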
auto isValFromDenseInputOrInvariant = [&op](Value v) -> bool { - if (auto bArg = v.dyn_cast(); + if (auto bArg = dyn_cast(v); bArg && !isSparseTensor(op.getDpsInputOperand(bArg.getArgNumber()))) return true; // If the value is defined outside the loop, it is a loop invariant. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp index 89af75dea2a0f2..de553a5f9bf08c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/CodegenUtils.cpp @@ -165,7 +165,7 @@ Value sparse_tensor::genCast(OpBuilder &builder, Location loc, Value value, Value sparse_tensor::genScalarToTensor(OpBuilder &builder, Location loc, Value elem, Type dstTp) { - if (auto rtp = dstTp.dyn_cast()) { + if (auto rtp = dyn_cast(dstTp)) { // Scalars can only be converted to 0-ranked tensors. assert(rtp.getRank() == 0); elem = sparse_tensor::genCast(builder, loc, elem, rtp.getElementType()); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp index 66f96ba08c0ed2..8981de58306da3 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp @@ -157,8 +157,7 @@ IterationGraphSorter::IterationGraphSorter( // The number of results of the map should match the rank of the tensor. assert(llvm::all_of(llvm::zip(loop2InsLvl, ins), [](auto mvPair) { auto [m, v] = mvPair; - return m.getNumResults() == - v.getType().template cast().getRank(); + return m.getNumResults() == cast(v.getType()).getRank(); })); itGraph.resize(getNumLoops(), std::vector(getNumLoops(), false)); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 0ce40e81371209..3ff41ab22fbc42 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -820,7 +820,7 @@ struct DimOfDestStyleOp : public OpRewritePattern { if (!destOp) return failure(); - auto resultIndex = source.cast().getResultNumber(); + auto resultIndex = cast(source).getResultNumber(); auto *initOperand = destOp.getDpsInitOperand(resultIndex); rewriter.modifyOpInPlace( @@ -1580,6 +1580,41 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) { llvm::dyn_cast_if_present(adaptor.getSource()), getResult().getType())) return reshapedSource; + + auto source = getSource(); + auto sourceTy = dyn_cast(source.getType()); + auto resultTy = dyn_cast(getType()); + + if (!sourceTy || !resultTy || sourceTy != resultTy) + return {}; + + if (auto fromElements = getShape().getDefiningOp()) { + auto elements = fromElements.getElements(); + bool dynamicNoop = + sourceTy.getRank() == static_cast(elements.size()); + for (int id = 0, s = elements.size(); id < s && dynamicNoop; ++id) { + auto element = elements[id]; + + if (auto cst = getConstantIntValue(element)) { + dynamicNoop &= cst.value() == sourceTy.getDimSize(id); + continue; + } + + if (auto dimOp = element.getDefiningOp()) { + dynamicNoop &= dimOp.getSource() == source; + + APSInt dim; + auto cst = getConstantIntValue(dimOp.getIndex()); + dynamicNoop &= + cst.has_value() && cst.value() == static_cast(id); + continue; + } + } + + if (dynamicNoop) + return source; + } + return {}; } @@ -3475,7 +3510,7 @@ SplatOp::reifyResultShapes(OpBuilder &builder, OpFoldResult SplatOp::fold(FoldAdaptor adaptor) { auto constOperand = 
adaptor.getInput(); - if (!constOperand.isa_and_nonnull()) + if (!isa_and_nonnull(constOperand)) return {}; // Do not fold if the splat is not statically shaped @@ -4307,7 +4342,7 @@ LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, /// unpack(destinationStyleOp(x)) -> unpack(x) if (auto dstStyleOp = unPackOp.getDest().getDefiningOp()) { - auto destValue = unPackOp.getDest().cast(); + auto destValue = cast(unPackOp.getDest()); Value newDest = dstStyleOp.getDpsInits()[destValue.getResultNumber()]; rewriter.modifyOpInPlace(unPackOp, [&]() { unPackOp.setDpsInitOperand(0, newDest); }); diff --git a/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp index 06a441dbeaf150..137156fe1a73e2 100644 --- a/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tosa/IR/ShardingInterfaceImpl.cpp @@ -32,7 +32,7 @@ namespace { struct MatMulOpSharding : public ShardingInterface::ExternalModel { SmallVector getLoopIteratorTypes(Operation *op) const { - auto tensorType = op->getResult(0).getType().dyn_cast(); + auto tensorType = dyn_cast(op->getResult(0).getType()); if (!tensorType) return {}; @@ -48,7 +48,7 @@ struct MatMulOpSharding } SmallVector getIndexingMaps(Operation *op) const { - auto tensorType = op->getResult(0).getType().dyn_cast(); + auto tensorType = dyn_cast(op->getResult(0).getType()); if (!tensorType) return {}; MLIRContext *ctx = op->getContext(); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index c8bf4c526b239f..c139d5f60024ca 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -285,7 +285,7 @@ struct ClampIsNoOp : public OpRewritePattern { return failure(); } - if (inputElementType.isa()) { + if (isa(inputElementType)) { // Unlike integer types, floating point types can represent infinity. 
auto minClamp = op.getMinFp(); auto maxClamp = op.getMaxFp(); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index e06ac9a27ae4cc..10e6016a1ed431 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -168,7 +168,7 @@ ParseResult mlir::tosa::parseTypeOrAttr(OpAsmParser &parser, TypeAttr &typeAttr, return parser.emitError(parser.getCurrentLocation()) << "expected attribute"; } - if (auto typedAttr = attr.dyn_cast()) { + if (auto typedAttr = dyn_cast(attr)) { typeAttr = TypeAttr::get(typedAttr.getType()); } return success(); @@ -186,7 +186,7 @@ ParseResult mlir::tosa::parseTypeOrAttr(OpAsmParser &parser, TypeAttr &typeAttr, void mlir::tosa::printTypeOrAttr(OpAsmPrinter &p, Operation *op, TypeAttr type, Attribute attr) { bool needsSpace = false; - auto typedAttr = attr.dyn_cast_or_null(); + auto typedAttr = dyn_cast_or_null(attr); if (!typedAttr || typedAttr.getType() != type.getValue()) { p << ": "; p.printAttribute(type); diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp index 6575b39fd45a1f..6eef2c5018d6d7 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp @@ -371,7 +371,7 @@ struct ReduceConstantOptimization : public OpRewritePattern { auto reductionAxis = op.getAxis(); const auto denseElementsAttr = constOp.getValue(); const auto shapedOldElementsValues = - denseElementsAttr.getType().cast(); + cast(denseElementsAttr.getType()); if (!llvm::isa(shapedOldElementsValues.getElementType())) return rewriter.notifyMatchFailure( diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index 74ef6381f3d701..c99f62d5ae1124 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -357,7 +357,7 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { bool levelCheckTransposeConv2d(Operation *op) { if (auto transpose = dyn_cast(op)) { if (ShapedType filterType = - transpose.getFilter().getType().dyn_cast()) { + dyn_cast(transpose.getFilter().getType())) { auto shape = filterType.getShape(); assert(shape.size() == 4); // level check kernel sizes for kH and KW diff --git a/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp b/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp index 4b3e28e4313c64..94d4a96a07ad64 100644 --- a/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp +++ b/mlir/lib/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp @@ -21,16 +21,15 @@ DiagnosedSilenceableFailure transform::DebugEmitRemarkAtOp::apply(transform::TransformRewriter &rewriter, transform::TransformResults &results, transform::TransformState &state) { - if (getAt().getType().isa()) { + if (isa(getAt().getType())) { auto payload = state.getPayloadOps(getAt()); for (Operation *op : payload) op->emitRemark() << getMessage(); return DiagnosedSilenceableFailure::success(); } - assert( - getAt().getType().isa() && - "unhandled kind of transform type"); + assert(isa(getAt().getType()) && + "unhandled kind of transform type"); auto describeValue = [](Diagnostic &os, Value value) { os << "value handle points to "; diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 53f958caa0bdb7..7a5a6974700586 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ 
b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -1615,7 +1615,7 @@ transform::GetTypeOp::apply(transform::TransformRewriter &rewriter, } params.push_back(TypeAttr::get(type)); } - results.setParams(getResult().cast(), params); + results.setParams(cast(getResult()), params); return DiagnosedSilenceableFailure::success(); } @@ -2217,14 +2217,14 @@ transform::NumAssociationsOp::apply(transform::TransformRewriter &rewriter, llvm_unreachable("unknown kind of transform dialect type"); return 0; }); - results.setParams(getNum().cast(), + results.setParams(cast(getNum()), rewriter.getI64IntegerAttr(numAssociations)); return DiagnosedSilenceableFailure::success(); } LogicalResult transform::NumAssociationsOp::verify() { // Verify that the result type accepts an i64 attribute as payload. - auto resultType = getNum().getType().cast(); + auto resultType = cast(getNum().getType()); return resultType .checkPayload(getLoc(), {Builder(getContext()).getI64IntegerAttr(0)}) .checkAndReport(); diff --git a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp index 8d9f105d1c5dba..9a24c2baebabb2 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp @@ -44,7 +44,7 @@ DiagnosedSilenceableFailure transform::AffineMapParamType::checkPayload(Location loc, ArrayRef payload) const { for (Attribute attr : payload) { - if (!attr.isa()) { + if (!mlir::isa(attr)) { return emitSilenceableError(loc) << "expected affine map attribute, got " << attr; } @@ -144,7 +144,7 @@ DiagnosedSilenceableFailure transform::TypeParamType::checkPayload(Location loc, ArrayRef payload) const { for (Attribute attr : payload) { - if (!attr.isa()) { + if (!mlir::isa(attr)) { return emitSilenceableError(loc) << "expected type attribute, got " << attr; } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 3e6425879cc67f..d10a31941db4f6 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -6169,7 +6169,7 @@ void mlir::vector::populateVectorToVectorCanonicalizationPatterns( OpFoldResult SplatOp::fold(FoldAdaptor adaptor) { auto constOperand = adaptor.getInput(); - if (!constOperand.isa_and_nonnull()) + if (!isa_and_nonnull(constOperand)) return {}; // SplatElementsAttr::get treats single value for second arg as being a splat. 
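A note on the recurring one-line hunks in this patch: they migrate MLIR's member-style casts on Type, Attribute, and Value (value.isa<T>(), value.cast<T>(), value.dyn_cast<T>()) to the free functions isa<T>(value), cast<T>(value), and dyn_cast<T>(value) from LLVM's casting infrastructure. The explicit template arguments appear to have been stripped from inside the angle brackets in this copy of the patch (e.g. "cast(getResult())"); upstream, each call names a concrete type. Below is a minimal sketch of the three before/after shapes, assuming standard MLIR headers; the helper names are illustrative, not taken from the patch:

#include "mlir/IR/BuiltinTypes.h" // FloatType, RankedTensorType, ShapedType
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h" // re-exports llvm::isa/cast/dyn_cast into mlir

using namespace mlir;

// Checked cast; asserts if the type does not match.
// Before: v.getType().cast<ShapedType>().getRank()
static int64_t rankOf(Value v) {
  return cast<ShapedType>(v.getType()).getRank();
}

// Nullable cast; returns a null handle on mismatch.
// Before: t.dyn_cast<RankedTensorType>()
static RankedTensorType asRankedTensor(Type t) {
  return dyn_cast<RankedTensorType>(t);
}

// Predicate form. Before: t.isa<FloatType>()
static bool isFloat(Type t) { return isa<FloatType>(t); }

The nullable variants that also appear in these hunks (dyn_cast_or_null, isa_and_nonnull) follow the same free-function spelling, which works uniformly across Type, Attribute, and Value and matches the convention used in the rest of LLVM.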
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp index 0693aa596cb28f..b30b43d70bf0f4 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTransfer.cpp @@ -57,7 +57,7 @@ static Value extendMaskRank(OpBuilder &builder, Location loc, Value vec, Value broadcasted = extendVectorRank(builder, loc, vec, addedRank); SmallVector permutation; for (int64_t i = addedRank, - e = broadcasted.getType().cast().getRank(); + e = cast(broadcasted.getType()).getRank(); i < e; ++i) permutation.push_back(i); for (int64_t i = 0; i < addedRank; ++i) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp index 8d733c5a8849b6..7ed3dea42b7715 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -403,7 +403,7 @@ mlir::vector::castAwayContractionLeadingOneDim(vector::ContractionOp contractOp, // Such transposes do not materially affect the underlying vector and can // be omitted. E.g.: perm [1, 0, 2] applied to vector<1x1x8xi32> bool transposeNonOuterUnitDims = false; - auto operandShape = operands[it.index()].getType().cast(); + auto operandShape = cast(operands[it.index()].getType()); for (auto [index, dim] : llvm::enumerate(ArrayRef(perm).drop_back(1))) { if (dim != static_cast(index) && diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp index dc6f126aae4c87..d24721f3defa65 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @@ -63,7 +63,7 @@ static FailureOr getCompressedMaskOp(OpBuilder &rewriter, // new mask index) only happens on the last dimension of the vectors. Operation *newMask = nullptr; SmallVector shape( - maskOp->getResultTypes()[0].cast().getShape()); + cast(maskOp->getResultTypes()[0]).getShape()); shape.back() = numElements; auto newMaskType = VectorType::get(shape, rewriter.getI1Type()); if (createMaskOp) { diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp index b844c2bfa837ce..ee622e886f6185 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferSplitRewritePatterns.cpp @@ -171,7 +171,7 @@ static MemRefType getCastCompatibleMemRefType(MemRefType aT, MemRefType bT) { /// is first inserted, followed by a `memref.cast`.
static Value castToCompatibleMemRefType(OpBuilder &b, Value memref, MemRefType compatibleMemRefType) { - MemRefType sourceType = memref.getType().cast(); + MemRefType sourceType = cast(memref.getType()); Value res = memref; if (sourceType.getMemorySpace() != compatibleMemRefType.getMemorySpace()) { sourceType = MemRefType::get( diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt index f33061b2d87cff..9f57627c321fb0 100644 --- a/mlir/lib/Dialect/XeGPU/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 530c50ef74f7a0..23c5749c2309de 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -127,7 +127,7 @@ LogicalResult CreateNdDescOp::verify() { // check source type matches the rank if it is a memref. // It also should have the same ElementType as TensorDesc. - auto memrefTy = getSourceType().dyn_cast(); + auto memrefTy = dyn_cast(getSourceType()); if (memrefTy) { invalidRank |= (memrefTy.getRank() != rank); invalidElemTy |= memrefTy.getElementType() != getElementType(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt new file mode 100644 index 00000000000000..7fb64d3b97b87d --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -0,0 +1,17 @@ +add_mlir_dialect_library(MLIRXeGPUTransforms + XeGPUFoldAliasOps.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU + + DEPENDS + MLIRXeGPUPassIncGen + + LINK_LIBS PUBLIC + MLIRAffineUtils + MLIRIR + MLIRMemRefDialect + MLIRXeGPUDialect + MLIRPass + MLIRTransforms +) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp new file mode 100644 index 00000000000000..9307e8eb784b54 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp @@ -0,0 +1,82 @@ +//===- XeGPUFoldAliasOps.cpp - XeGPU alias ops folders ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" + +#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "llvm/Support/Debug.h" + +namespace mlir { +namespace xegpu { +#define GEN_PASS_DEF_XEGPUFOLDALIASOPS +#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" +} // namespace xegpu +} // namespace mlir + +#define DEBUG_TYPE "xegpu-fold-alias-ops" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + +using namespace mlir; + +namespace { +/// Merges subview operation with xegpu.create_nd_tdesc operation. 
+class XegpuCreateNdDescOpSubViewOpFolder final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(xegpu::CreateNdDescOp descOp, + PatternRewriter &rewriter) const override; +}; +} // namespace + +LogicalResult XegpuCreateNdDescOpSubViewOpFolder::matchAndRewrite( + xegpu::CreateNdDescOp descOp, PatternRewriter &rewriter) const { + auto subViewOp = descOp.getSource().getDefiningOp(); + + if (!subViewOp) + return rewriter.notifyMatchFailure(descOp, "not a subview producer"); + if (!subViewOp.hasUnitStride()) + return rewriter.notifyMatchFailure(descOp, "requires unit strides"); + + SmallVector resolvedOffsets; + affine::resolveIndicesIntoOpWithOffsetsAndStrides( + rewriter, descOp.getLoc(), subViewOp.getMixedOffsets(), + subViewOp.getMixedStrides(), subViewOp.getDroppedDims(), + descOp.getMixedOffsets(), resolvedOffsets); + + rewriter.replaceOpWithNewOp( + descOp, descOp.getTensorDesc().getType(), subViewOp.getSource(), + getAsOpFoldResult(resolvedOffsets)); + + return success(); +} + +void xegpu::populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +namespace { + +struct XeGPUFoldAliasOpsPass final + : public xegpu::impl::XeGPUFoldAliasOpsBase { + void runOnOperation() override; +}; + +} // namespace + +void XeGPUFoldAliasOpsPass::runOnOperation() { + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUFoldAliasOpsPatterns(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index 6cdc2682753fc7..411ac656e4afbd 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -711,7 +711,7 @@ AffineMap mlir::foldAttributesIntoMap(Builder &b, AffineMap map, for (int64_t i = 0; i < map.getNumDims(); ++i) { if (auto attr = operands[i].dyn_cast()) { dimReplacements.push_back( - b.getAffineConstantExpr(attr.cast().getInt())); + b.getAffineConstantExpr(cast(attr).getInt())); } else { dimReplacements.push_back(b.getAffineDimExpr(numDims++)); remainingValues.push_back(operands[i].get()); @@ -721,7 +721,7 @@ AffineMap mlir::foldAttributesIntoMap(Builder &b, AffineMap map, for (int64_t i = 0; i < map.getNumSymbols(); ++i) { if (auto attr = operands[i + map.getNumDims()].dyn_cast()) { symReplacements.push_back( - b.getAffineConstantExpr(attr.cast().getInt())); + b.getAffineConstantExpr(cast(attr).getInt())); } else { symReplacements.push_back(b.getAffineSymbolExpr(numSymbols++)); remainingValues.push_back(operands[i + map.getNumDims()].get()); diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp index db903d540761b7..0feb078db297d3 100644 --- a/mlir/lib/IR/Operation.cpp +++ b/mlir/lib/IR/Operation.cpp @@ -1154,7 +1154,7 @@ LogicalResult OpTrait::impl::verifySameOperandsAndResultRank(Operation *op) { // delegate function that returns rank of shaped type with known rank auto getRank = [](const Type type) { - return type.cast().getRank(); + return cast(type).getRank(); }; auto rank = !rankedOperandTypes.empty() ? 
getRank(*rankedOperandTypes.begin()) diff --git a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp index e93a9efbb76c17..15cfb3dbaf7451 100644 --- a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp +++ b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp @@ -75,10 +75,12 @@ mlir::detail::getDefaultTypeSizeInBits(Type type, const DataLayout &dataLayout, // there is no bit-packing at the moment element sizes are taken in bytes and // multiplied with 8 bits. // TODO: make this extensible. - if (auto vecType = dyn_cast(type)) - return vecType.getNumElements() / vecType.getShape().back() * - llvm::PowerOf2Ceil(vecType.getShape().back()) * - dataLayout.getTypeSize(vecType.getElementType()) * 8; + if (auto vecType = dyn_cast(type)) { + uint64_t baseSize = vecType.getNumElements() / vecType.getShape().back() * + llvm::PowerOf2Ceil(vecType.getShape().back()) * + dataLayout.getTypeSize(vecType.getElementType()) * 8; + return llvm::TypeSize::get(baseSize, vecType.isScalable()); + } if (auto typeInterface = dyn_cast(type)) return typeInterface.getTypeSizeInBits(dataLayout, params); @@ -138,9 +140,10 @@ getFloatTypeABIAlignment(FloatType fltType, const DataLayout &dataLayout, uint64_t mlir::detail::getDefaultABIAlignment( Type type, const DataLayout &dataLayout, ArrayRef params) { - // Natural alignment is the closest power-of-two number above. + // Natural alignment is the closest power-of-two number above. For scalable + // vectors, aligning them to the same as the base vector is sufficient. if (isa(type)) - return llvm::PowerOf2Ceil(dataLayout.getTypeSize(type)); + return llvm::PowerOf2Ceil(dataLayout.getTypeSize(type).getKnownMinValue()); if (auto fltType = dyn_cast(type)) return getFloatTypeABIAlignment(fltType, dataLayout, params); diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp index c521d76a429958..101add70c51e29 100644 --- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp @@ -30,6 +30,12 @@ void registerFromLLVMIRTranslation() { llvm::cl::desc("Emit expensive warnings during LLVM IR import " "(discouraged: testing only!)"), llvm::cl::init(false)); + static llvm::cl::opt dropDICompositeTypeElements( + "drop-di-composite-type-elements", + llvm::cl::desc( + "Avoid translating the elements of DICompositeTypes during " + "the LLVM IR import (discouraged: testing only!)"), + llvm::cl::init(false)); TranslateToMLIRRegistration registration( "import-llvm", "Translate LLVMIR to MLIR", @@ -51,7 +57,8 @@ void registerFromLLVMIRTranslation() { return nullptr; return translateLLVMIRToModule(std::move(llvmModule), context, - emitExpensiveWarnings); + emitExpensiveWarnings, + dropDICompositeTypeElements); }, [](DialectRegistry ®istry) { // Register the DLTI dialect used to express the data layout diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index 4a4e1d1ecdd868..1ab55b079b5294 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -26,9 +26,11 @@ using namespace mlir; using namespace mlir::LLVM; using namespace mlir::LLVM::detail; -DebugImporter::DebugImporter(ModuleOp mlirModule) +DebugImporter::DebugImporter(ModuleOp mlirModule, + bool dropDICompositeTypeElements) : recursionPruner(mlirModule.getContext()), - context(mlirModule.getContext()), mlirModule(mlirModule) {} + context(mlirModule.getContext()), mlirModule(mlirModule), + 
dropDICompositeTypeElements(dropDICompositeTypeElements) {} Location DebugImporter::translateFuncLocation(llvm::Function *func) { llvm::DISubprogram *subprogram = func->getSubprogram(); @@ -69,9 +71,14 @@ DICompileUnitAttr DebugImporter::translateImpl(llvm::DICompileUnit *node) { DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) { std::optional flags = symbolizeDIFlags(node->getFlags()); SmallVector elements; - for (llvm::DINode *element : node->getElements()) { - assert(element && "expected a non-null element type"); - elements.push_back(translate(element)); + + // A vector always requires an element. + bool isVectorType = flags && bitEnumContainsAll(*flags, DIFlags::Vector); + if (isVectorType || !dropDICompositeTypeElements) { + for (llvm::DINode *element : node->getElements()) { + assert(element && "expected a non-null element type"); + elements.push_back(translate(element)); + } } // Drop the elements parameter if any of the elements are invalid. if (llvm::is_contained(elements, nullptr)) diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h index 8b22dc63456775..5f402fb0b657cc 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.h +++ b/mlir/lib/Target/LLVMIR/DebugImporter.h @@ -29,7 +29,7 @@ namespace detail { class DebugImporter { public: - DebugImporter(ModuleOp mlirModule); + DebugImporter(ModuleOp mlirModule, bool dropDICompositeTypeElements); /// Translates the given LLVM debug location to an MLIR location. Location translateLoc(llvm::DILocation *loc); @@ -184,6 +184,12 @@ class DebugImporter { MLIRContext *context; ModuleOp mlirModule; + + /// An option to control if DICompositeTypes should always be imported without + /// converting their elements. If set, the option avoids the recursive + /// traversal of composite type debug information, which can be expensive for + /// adversarial inputs. + bool dropDICompositeTypeElements; }; } // namespace detail diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index e89ff9209b034a..ebcdbc02aadd07 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2489,7 +2489,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder, auto addDevInfos = [&, fail](auto devOperands, auto devOpType) -> void { for (const auto &devOp : devOperands) { // TODO: Only LLVMPointerTypes are handled. 
- if (!devOp.getType().template isa()) + if (!isa(devOp.getType())) return fail(); llvm::Value *mapOpValue = moduleTranslation.lookupValue(devOp); @@ -3083,10 +3083,9 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, std::vector generatedRefs; std::vector targetTriple; - auto targetTripleAttr = - op->getParentOfType() - ->getAttr(LLVM::LLVMDialect::getTargetTripleAttrName()) - .dyn_cast_or_null(); + auto targetTripleAttr = dyn_cast_or_null( + op->getParentOfType()->getAttr( + LLVM::LLVMDialect::getTargetTripleAttrName())); if (targetTripleAttr) targetTriple.emplace_back(targetTripleAttr.data()); @@ -3328,7 +3327,7 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( attribute.getName()) .Case("omp.is_target_device", [&](Attribute attr) { - if (auto deviceAttr = attr.dyn_cast()) { + if (auto deviceAttr = dyn_cast(attr)) { llvm::OpenMPIRBuilderConfig &config = moduleTranslation.getOpenMPBuilder()->Config; config.setIsTargetDevice(deviceAttr.getValue()); @@ -3338,7 +3337,7 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( }) .Case("omp.is_gpu", [&](Attribute attr) { - if (auto gpuAttr = attr.dyn_cast()) { + if (auto gpuAttr = dyn_cast(attr)) { llvm::OpenMPIRBuilderConfig &config = moduleTranslation.getOpenMPBuilder()->Config; config.setIsGPU(gpuAttr.getValue()); @@ -3348,7 +3347,7 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( }) .Case("omp.host_ir_filepath", [&](Attribute attr) { - if (auto filepathAttr = attr.dyn_cast()) { + if (auto filepathAttr = dyn_cast(attr)) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); ompBuilder->loadOffloadInfoMetadata(filepathAttr.getValue()); @@ -3358,13 +3357,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( }) .Case("omp.flags", [&](Attribute attr) { - if (auto rtlAttr = attr.dyn_cast()) + if (auto rtlAttr = dyn_cast(attr)) return convertFlagsAttr(op, rtlAttr, moduleTranslation); return failure(); }) .Case("omp.version", [&](Attribute attr) { - if (auto versionAttr = attr.dyn_cast()) { + if (auto versionAttr = dyn_cast(attr)) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); ompBuilder->M.addModuleFlag(llvm::Module::Max, "openmp", @@ -3376,15 +3375,14 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( .Case("omp.declare_target", [&](Attribute attr) { if (auto declareTargetAttr = - attr.dyn_cast()) + dyn_cast(attr)) return convertDeclareTargetAttr(op, declareTargetAttr, moduleTranslation); return failure(); }) .Case("omp.requires", [&](Attribute attr) { - if (auto requiresAttr = - attr.dyn_cast()) { + if (auto requiresAttr = dyn_cast(attr)) { using Requires = omp::ClauseRequires; Requires flags = requiresAttr.getValue(); llvm::OpenMPIRBuilderConfig &config = diff --git a/mlir/lib/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.cpp index 8212725b5a58be..b78b002d322920 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.cpp @@ -29,8 +29,8 @@ using mlir::LLVM::detail::createIntrinsicCall; /// option around. 
static llvm::Type *getXlenType(Attribute opcodeAttr, LLVM::ModuleTranslation &moduleTranslation) { - auto intAttr = opcodeAttr.cast(); - unsigned xlenWidth = intAttr.getType().cast().getWidth(); + auto intAttr = cast(opcodeAttr); + unsigned xlenWidth = cast(intAttr.getType()).getWidth(); return llvm::Type::getIntNTy(moduleTranslation.getLLVMContext(), xlenWidth); } diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index af998b99d511f0..191b84acd56fae 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -155,12 +155,14 @@ getTopologicallySortedBlocks(ArrayRef basicBlocks) { ModuleImport::ModuleImport(ModuleOp mlirModule, std::unique_ptr llvmModule, - bool emitExpensiveWarnings) + bool emitExpensiveWarnings, + bool importEmptyDICompositeTypes) : builder(mlirModule->getContext()), context(mlirModule->getContext()), mlirModule(mlirModule), llvmModule(std::move(llvmModule)), iface(mlirModule->getContext()), typeTranslator(*mlirModule->getContext()), - debugImporter(std::make_unique(mlirModule)), + debugImporter(std::make_unique( + mlirModule, importEmptyDICompositeTypes)), loopAnnotationImporter( std::make_unique(*this, builder)), emitExpensiveWarnings(emitExpensiveWarnings) { @@ -625,8 +627,8 @@ void ModuleImport::setNonDebugMetadataAttrs(llvm::Instruction *inst, } } -void ModuleImport::setIntegerOverflowFlagsAttr(llvm::Instruction *inst, - Operation *op) const { +void ModuleImport::setIntegerOverflowFlags(llvm::Instruction *inst, + Operation *op) const { auto iface = cast(op); IntegerOverflowFlags value = {}; @@ -634,8 +636,7 @@ void ModuleImport::setIntegerOverflowFlagsAttr(llvm::Instruction *inst, value = bitEnumSet(value, IntegerOverflowFlags::nuw, inst->hasNoUnsignedWrap()); - auto attr = IntegerOverflowFlagsAttr::get(op->getContext(), value); - iface->setAttr(iface.getIntegerOverflowAttrName(), attr); + iface.setOverflowFlags(value); } void ModuleImport::setFastmathFlagsAttr(llvm::Instruction *inst, @@ -2092,8 +2093,8 @@ ModuleImport::translateLoopAnnotationAttr(const llvm::MDNode *node, OwningOpRef mlir::translateLLVMIRToModule(std::unique_ptr llvmModule, - MLIRContext *context, - bool emitExpensiveWarnings) { + MLIRContext *context, bool emitExpensiveWarnings, + bool dropDICompositeTypeElements) { // Preload all registered dialects to allow the import to iterate the // registered LLVMImportDialectInterface implementations and query the // supported LLVM IR constructs before starting the translation. Assumes the @@ -2109,7 +2110,7 @@ mlir::translateLLVMIRToModule(std::unique_ptr llvmModule, /*column=*/0))); ModuleImport moduleImport(module.get(), std::move(llvmModule), - emitExpensiveWarnings); + emitExpensiveWarnings, dropDICompositeTypeElements); if (failed(moduleImport.initializeImportInterface())) return {}; if (failed(moduleImport.convertDataLayout())) diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp index 1e620e46af84ea..0c1ce70f070852 100644 --- a/mlir/lib/Transforms/Mem2Reg.cpp +++ b/mlir/lib/Transforms/Mem2Reg.cpp @@ -46,7 +46,7 @@ using namespace mlir; /// Control flow can create situations where a load could be replaced by /// multiple possible stores depending on the control flow path taken. As a /// result, this pass must introduce new block arguments in some blocks to -/// accomodate for the multiple possible definitions. Each predecessor will +/// accommodate for the multiple possible definitions. 
Each predecessor will /// populate the block argument with the definition reached at its end. With /// this, the value stored can be well defined at block boundaries, allowing /// the propagation of replacement through blocks. @@ -109,7 +109,7 @@ struct MemorySlotPromotionInfo { /// This is a DAG structure because if an operation must eliminate some of /// its uses, it is because the defining ops of the blocking uses requested /// it. The defining ops therefore must also have blocking uses or be the - /// starting point of the bloccking uses. + /// starting point of the blocking uses. BlockingUsesMap userToBlockingUses; }; @@ -414,7 +414,7 @@ MemorySlotPromotionAnalyzer::computeInfo() { // Then, compute blocks in which two or more definitions of the allocated // variable may conflict. These blocks will need a new block argument to - // accomodate this. + // accommodate this. computeMergePoints(info.mergePoints); // The slot can be promoted if the block arguments to be created can diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index ddd5c6b5ac747b..a6c78880c8e7c8 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -79,7 +79,7 @@ declare_mlir_dialect_python_bindings( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" TD_FILE dialects/AsyncOps.td SOURCES_GLOB dialects/async_dialect/*.py - DIALECT_NAME async_dialect) + DIALECT_NAME async) declare_mlir_dialect_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects @@ -591,7 +591,7 @@ declare_mlir_python_extension(MLIRPythonExtension.Dialects.Transform.Pybind declare_mlir_python_extension(MLIRPythonExtension.AsyncDialectPasses MODULE_NAME _mlirAsyncPasses - ADD_TO_PARENT MLIRPythonSources.Dialects.async_dialect + ADD_TO_PARENT MLIRPythonSources.Dialects.async ROOT_DIR "${PYTHON_SOURCE_DIR}" SOURCES AsyncPasses.cpp diff --git a/mlir/python/mlir/dialects/async_dialect/__init__.py b/mlir/python/mlir/dialects/async_dialect/__init__.py index dcf9d6cb2638f6..6a5ecfc20956cf 100644 --- a/mlir/python/mlir/dialects/async_dialect/__init__.py +++ b/mlir/python/mlir/dialects/async_dialect/__init__.py @@ -2,4 +2,4 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from .._async_dialect_ops_gen import * +from .._async_ops_gen import * diff --git a/mlir/python/mlir/dialects/memref.py b/mlir/python/mlir/dialects/memref.py index a3d783415855e1..bc9a3a52728ad4 100644 --- a/mlir/python/mlir/dialects/memref.py +++ b/mlir/python/mlir/dialects/memref.py @@ -8,12 +8,13 @@ from ._memref_ops_gen import * from ._ods_common import _dispatch_mixed_values, MixedValues from .arith import ConstantOp, _is_integer_like_type -from ..ir import Value, MemRefType, StridedLayoutAttr, ShapedType +from ..ir import Value, MemRefType, StridedLayoutAttr, ShapedType, Operation def _is_constant_int_like(i): return ( isinstance(i, Value) + and isinstance(i.owner, Operation) and isinstance(i.owner.opview, ConstantOp) and _is_integer_like_type(i.type) ) diff --git a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir index c276a5b0c2a14b..297be91e772838 100644 --- a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir +++ b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt -test-lower-to-arm-neon -verify-diagnostics -split-input-file %s | FileCheck %s -// CHECK-LABEL: test_lower_vector_arm_neon_mixed_types +// CHECK-LABEL: vector_arm_neon_mixed_types // CHECK-SAME: %[[A0:.*]]: vector<2x8xi8>, %[[A1:.*]]: vector<2x8xi4>, %[[A2:.*]]: vector<2x2xi32> // CHECK-DAG: %[[D0:.*]] = arith.extsi %[[A1]] : vector<2x8xi4> to vector<2x8xi8> // CHECK-DAG: %[[D1:.*]] = vector.shape_cast %[[A0]] : vector<2x8xi8> to vector<16xi8> @@ -8,7 +8,7 @@ // CHECK-DAG: %[[D3:.*]] = vector.shape_cast %[[A2]] : vector<2x2xi32> to vector<4xi32> // CHECK-DAG: %[[D4:.*]] = arm_neon.intr.smmla %[[D3]], %[[D1]], %[[D2]] : vector<16xi8> to vector<4xi32> // CHECK-DAG: %[[D5:.*]] = vector.shape_cast %[[D4]] : vector<4xi32> to vector<2x2xi32> -func.func @test_lower_vector_arm_neon_mixed_types(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi4>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { +func.func @vector_arm_neon_mixed_types(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi4>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { %lhs_extsi = arith.extsi %lhs : vector<2x8xi8> to vector<2x8xi32> %rhs_extsi = arith.extsi %rhs : vector<2x8xi4> to vector<2x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> @@ -17,14 +17,14 @@ func.func @test_lower_vector_arm_neon_mixed_types(%lhs: vector<2x8xi8>, %rhs: ve // ----- -// CHECK-LABEL: test_lower_vector_arm_neon_same_types +// CHECK-LABEL: vector_arm_neon_same_types // CHECK-SAME: %[[A0:.*]]: vector<2x8xi8>, %[[A1:.*]]: vector<2x8xi8>, %[[A2:.*]]: vector<2x2xi32> // CHECK-DAG: %[[D0:.*]] = vector.shape_cast %[[A0]] : vector<2x8xi8> to vector<16xi8> // CHECK-DAG: %[[D1:.*]] = vector.shape_cast %[[A1]] : vector<2x8xi8> to vector<16xi8> // CHECK-DAG: %[[D2:.*]] = vector.shape_cast %[[A2]] : vector<2x2xi32> to vector<4xi32> // CHECK-DAG: %[[D3:.*]] = arm_neon.intr.smmla %[[D2]], %[[D0]], %[[D1]] : vector<16xi8> to vector<4xi32> // CHECK-DAG: %[[D4:.*]] = vector.shape_cast %[[D3]] : vector<4xi32> to vector<2x2xi32> -func.func @test_lower_vector_arm_neon_same_types(%lhs: vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { +func.func @vector_arm_neon_same_types(%lhs: 
vector<2x8xi8>, %rhs: vector<2x8xi8>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { %lhs_extsi = arith.extsi %lhs : vector<2x8xi8> to vector<2x8xi32> %rhs_extsi = arith.extsi %rhs : vector<2x8xi8> to vector<2x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> @@ -33,17 +33,17 @@ func.func @test_lower_vector_arm_neon_same_types(%lhs: vector<2x8xi8>, %rhs: vec // ----- -// CHECK-LABEL: test_lower_vector_arm_neon_without_extsi +// CHECK-LABEL: vector_arm_neon_without_extsi // CHECK-SAME: %[[A0:.*]]: vector<2x8xi32>, %[[A1:.*]]: vector<2x8xi32>, %[[A2:.*]]: vector<2x2xi32> // CHECK-DAG: %[[D0:.*]] = vector.contract -func.func @test_lower_vector_arm_neon_without_extsi(%lhs: vector<2x8xi32>, %rhs: vector<2x8xi32>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { +func.func @vector_arm_neon_without_extsi(%lhs: vector<2x8xi32>, %rhs: vector<2x8xi32>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs, %rhs, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32> return %res : vector<2x2xi32> } // ----- -// CHECK-LABEL: test_lower_vector_arm_neon_unroll +// CHECK-LABEL: vector_arm_neon_unroll // CHECK-SAME: %[[VAL_0:.*]]: vector<4x8xi8>, %[[VAL_1:.*]]: vector<4x8xi8>, %[[VAL_2:.*]]: vector<4x4xi32> // CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<0> : vector<4x4xi32> // CHECK-DAG: %[[VAL_4:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8> @@ -84,7 +84,7 @@ func.func @test_lower_vector_arm_neon_without_extsi(%lhs: vector<2x8xi32>, %rhs: // CHECK-DAG: %[[VAL_39:.*]] = vector.insert_strided_slice %[[VAL_38]], %[[VAL_30]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32> // CHECK-DAG: return %[[VAL_39]] : vector<4x4xi32> // CHECK-DAG: } -func.func @test_lower_vector_arm_neon_unroll(%lhs: vector<4x8xi8>, %rhs: vector<4x8xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> { +func.func @vector_arm_neon_unroll(%lhs: vector<4x8xi8>, %rhs: vector<4x8xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> { %lhs_extsi = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32> %rhs_extsi = arith.extsi %rhs : vector<4x8xi8> to vector<4x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<4x8xi32>, vector<4x8xi32> into vector<4x4xi32> @@ -93,7 +93,7 @@ func.func @test_lower_vector_arm_neon_unroll(%lhs: vector<4x8xi8>, %rhs: vector< // ----- -// CHECK-LABEL: func.func @test_lower_vector_arm_neon_mixed_unroll( +// CHECK-LABEL: func.func @vector_arm_neon_mixed_unroll( // CHECK-SAME: %[[VAL_0:.*]]: vector<4x8xi8>, // CHECK-SAME: %[[VAL_1:.*]]: vector<2x8xi4>, // CHECK-SAME: %[[VAL_2:.*]]: vector<4x2xi32>) -> vector<4x2xi32> { @@ -117,7 +117,7 @@ func.func @test_lower_vector_arm_neon_unroll(%lhs: vector<4x8xi8>, %rhs: vector< // CHECK-DAG: %[[VAL_20:.*]] = vector.insert_strided_slice %[[VAL_19]], 
%[[VAL_12]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x2xi32> // CHECK-DAG: return %[[VAL_20]] : vector<4x2xi32> // CHECK-DAG: } -func.func @test_lower_vector_arm_neon_mixed_unroll(%lhs: vector<4x8xi8>, %rhs: vector<2x8xi4>, %acc : vector<4x2xi32>) -> vector<4x2xi32> { +func.func @vector_arm_neon_mixed_unroll(%lhs: vector<4x8xi8>, %rhs: vector<2x8xi4>, %acc : vector<4x2xi32>) -> vector<4x2xi32> { %lhs_extsi = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32> %rhs_extsi = arith.extsi %rhs : vector<2x8xi4> to vector<2x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<4x8xi32>, vector<2x8xi32> into vector<4x2xi32> @@ -126,9 +126,9 @@ func.func @test_lower_vector_arm_neon_mixed_unroll(%lhs: vector<4x8xi8>, %rhs: v // ----- -// CHECK-LABEL: func.func @test_lower_vector_arm_neon_unroll_incompatible_shape( +// CHECK-LABEL: func.func @vector_arm_neon_unroll_incompatible_shape( // CHECK-DAG: %[[result:.*]] = vector.contract -func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x12xi8>, %rhs: vector<4x12xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> { +func.func @vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x12xi8>, %rhs: vector<4x12xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> { %lhs_extsi = arith.extsi %lhs : vector<4x12xi8> to vector<4x12xi32> %rhs_extsi = arith.extsi %rhs : vector<4x12xi8> to vector<4x12xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<4x12xi32>, vector<4x12xi32> into vector<4x4xi32> @@ -137,7 +137,7 @@ func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x1 // ----- -// CHECK-LABEL: func.func @test_lower_vector_arm_neon_vecmat_unroll( +// CHECK-LABEL: func.func @vector_arm_neon_vecmat_unroll( // CHECK-SAME: %[[VAL_0:.*]]: vector<8xi8>, // CHECK-SAME: %[[VAL_1:.*]]: vector<8x8xi8>, // CHECK-SAME: %[[VAL_2:.*]]: vector<8xi32>) -> vector<8xi32> { @@ -190,7 +190,7 @@ func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x1 // CHECK: %[[VAL_49:.*]] = vector.insert_strided_slice %[[VAL_48]], %[[VAL_38]] {offsets = [6], strides = [1]} : vector<2xi32> into vector<8xi32> // CHECK: return %[[VAL_49]] : vector<8xi32> // CHECK: } -func.func @test_lower_vector_arm_neon_vecmat_unroll(%lhs: vector<8xi8>, %rhs: vector<8x8xi8>, %acc : vector<8xi32>) -> vector<8xi32> { +func.func @vector_arm_neon_vecmat_unroll(%lhs: vector<8xi8>, %rhs: vector<8x8xi8>, %acc : vector<8xi32>) -> vector<8xi32> { %lhs_extsi= arith.extsi %lhs : vector<8xi8> to vector<8xi32> %rhs_extsi = arith.extsi %rhs : vector<8x8xi8> to vector<8x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<8xi32>, vector<8x8xi32> into vector<8xi32> @@ -199,7 +199,7 @@ func.func @test_lower_vector_arm_neon_vecmat_unroll(%lhs: vector<8xi8>, %rhs: ve // ----- -// CHECK-LABEL: func.func @test_lower_vector_arm_neon_vecmat_unroll_leading_dim( +// CHECK-LABEL: func.func 
@vector_arm_neon_vecmat_unroll_leading_dim( // CHECK-SAME: %[[VAL_0:.*]]: vector<1x8xi8>, // CHECK-SAME: %[[VAL_1:.*]]: vector<8x8xi8>, // CHECK-SAME: %[[VAL_2:.*]]: vector<1x8xi32>) -> vector<1x8xi32> { @@ -252,7 +252,7 @@ func.func @test_lower_vector_arm_neon_vecmat_unroll(%lhs: vector<8xi8>, %rhs: ve // CHECK: %[[VAL_49:.*]] = vector.insert_strided_slice %[[VAL_48]], %[[VAL_38]] {offsets = [0, 6], strides = [1]} : vector<2xi32> into vector<1x8xi32> // CHECK: return %[[VAL_49]] : vector<1x8xi32> // CHECK: } -func.func @test_lower_vector_arm_neon_vecmat_unroll_leading_dim(%lhs: vector<1x8xi8>, %rhs: vector<8x8xi8>, %acc : vector<1x8xi32>) -> vector<1x8xi32> { +func.func @vector_arm_neon_vecmat_unroll_leading_dim(%lhs: vector<1x8xi8>, %rhs: vector<8x8xi8>, %acc : vector<1x8xi32>) -> vector<1x8xi32> { %lhs_extsi= arith.extsi %lhs : vector<1x8xi8> to vector<1x8xi32> %rhs_extsi = arith.extsi %rhs : vector<8x8xi8> to vector<8x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<1x8xi32>, vector<8x8xi32> into vector<1x8xi32> @@ -261,11 +261,96 @@ func.func @test_lower_vector_arm_neon_vecmat_unroll_leading_dim(%lhs: vector<1x8 // ----- -// CHECK-LABEL: func.func @test_lower_vector_arm_neon_matvec +// CHECK-LABEL: func.func @vector_arm_neon_matvec // CHECK-NOT: arm_neon.intr.smmla -func.func @test_lower_vector_arm_neon_matvec(%lhs: vector<8x8xi8>, %rhs: vector<8xi8>, %acc : vector<8xi32>) -> vector<8xi32> { +func.func @vector_arm_neon_matvec(%lhs: vector<8x8xi8>, %rhs: vector<8xi8>, %acc : vector<8xi32>) -> vector<8xi32> { %rhs_extsi= arith.extsi %rhs : vector<8xi8> to vector<8xi32> %lhs_extsi = arith.extsi %lhs : vector<8x8xi8> to vector<8x8xi32> %res = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<8x8xi32>, vector<8xi32> into vector<8xi32> return %res : vector<8xi32> } + + +// ----- + +// CHECK-LABEL: func.func @vector_arm_neon_k_unroll( +// CHECK-SAME: %[[VAL_0:.*]]: vector<2x16xi8>, +// CHECK-SAME: %[[VAL_1:.*]]: vector<2x16xi4>, +// CHECK-SAME: %[[VAL_2:.*]]: vector<2x2xi32>) -> vector<2x2xi32> { +// CHECK: %[[VAL_3:.*]] = arith.extsi %[[VAL_1]] : vector<2x16xi4> to vector<2x16xi8> +// CHECK: %[[VAL_4:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<2x16xi8> to vector<2x8xi8> +// CHECK: %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_3]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<2x16xi8> to vector<2x8xi8> +// CHECK: %[[VAL_6:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_7:.*]] = vector.shape_cast %[[VAL_5]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_8:.*]] = vector.shape_cast %[[VAL_2]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[KACC_0:.*]] = arm_neon.intr.smmla %[[VAL_8]], %[[VAL_6]], %[[VAL_7]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[VAL_10:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 8], sizes = [2, 8], strides = [1, 1]} : vector<2x16xi8> to vector<2x8xi8> +// CHECK: %[[VAL_11:.*]] = vector.extract_strided_slice %[[VAL_3]] {offsets = [0, 8], sizes = [2, 8], strides = [1, 1]} : vector<2x16xi8> to vector<2x8xi8> +// CHECK: 
%[[VAL_12:.*]] = vector.shape_cast %[[VAL_10]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_13:.*]] = vector.shape_cast %[[VAL_11]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[KACC_1:.*]] = arm_neon.intr.smmla %[[KACC_0]], %[[VAL_12]], %[[VAL_13]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[VAL_15:.*]] = vector.shape_cast %[[KACC_1]] : vector<4xi32> to vector<2x2xi32> +// CHECK: return %[[VAL_15]] : vector<2x2xi32> +// CHECK: } +func.func @vector_arm_neon_k_unroll(%lhs: vector<2x16xi8>, %rhs: vector<2x16xi4>, %acc : vector<2x2xi32>) -> vector<2x2xi32> { + %lhs_extsi = arith.extsi %lhs : vector<2x16xi8> to vector<2x16xi32> + %rhs_extsi = arith.extsi %rhs : vector<2x16xi4> to vector<2x16xi32> + %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<2x16xi32>, vector<2x16xi32> into vector<2x2xi32> + return %res : vector<2x2xi32> +} + +// ----- + +// CHECK-LABEL: func.func @vector_arm_neon_k_unroll_vecmat( +// CHECK-SAME: %[[VAL_0:.*]]: vector<1x32xi8>, +// CHECK-SAME: %[[VAL_1:.*]]: vector<2x32xi4>, +// CHECK-SAME: %[[VAL_2:.*]]: vector<1x2xi32>) -> vector<1x2xi32> { +// CHECK: %[[VAL_3:.*]] = arith.constant dense<0> : vector<2x2xi32> +// CHECK: %[[VAL_4:.*]] = arith.constant dense<0> : vector<2x8xi8> +// CHECK: %[[VAL_5:.*]] = arith.constant dense<0> : vector<1x2xi32> +// CHECK: %[[VAL_6:.*]] = arith.extsi %[[VAL_1]] : vector<2x32xi4> to vector<2x32xi8> +// CHECK: %[[VAL_7:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [1, 8], strides = [1, 1]} : vector<1x32xi8> to vector<1x8xi8> +// CHECK: %[[VAL_8:.*]] = vector.extract_strided_slice %[[VAL_6]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<2x32xi8> to vector<2x8xi8> +// CHECK: %[[VAL_9:.*]] = vector.insert_strided_slice %[[VAL_7]], %[[VAL_4]] {offsets = [0, 0], strides = [1, 1]} : vector<1x8xi8> into vector<2x8xi8> +// CHECK: %[[VAL_10:.*]] = vector.insert_strided_slice %[[VAL_2]], %[[VAL_3]] {offsets = [0, 0], strides = [1, 1]} : vector<1x2xi32> into vector<2x2xi32> +// CHECK: %[[VAL_11:.*]] = vector.shape_cast %[[VAL_9]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_12:.*]] = vector.shape_cast %[[VAL_8]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_13:.*]] = vector.shape_cast %[[VAL_10]] : vector<2x2xi32> to vector<4xi32> +// CHECK: %[[KACC_0:.*]] = arm_neon.intr.smmla %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[VAL_15:.*]] = vector.shape_cast %[[KACC_0]] : vector<4xi32> to vector<2x2xi32> +// CHECK: %[[VAL_16:.*]] = vector.extract %[[VAL_15]][0] : vector<2xi32> from vector<2x2xi32> +// CHECK: %[[VAL_17:.*]] = vector.insert_strided_slice %[[VAL_16]], %[[VAL_5]] {offsets = [0, 0], strides = [1]} : vector<2xi32> into vector<1x2xi32> +// CHECK: %[[VAL_18:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 8], sizes = [1, 8], strides = [1, 1]} : vector<1x32xi8> to vector<1x8xi8> +// CHECK: %[[VAL_19:.*]] = vector.extract_strided_slice %[[VAL_6]] {offsets = [0, 8], sizes = [2, 8], strides = [1, 1]} : vector<2x32xi8> to vector<2x8xi8> +// CHECK: %[[VAL_20:.*]] = vector.insert_strided_slice %[[VAL_18]], %[[VAL_4]] {offsets = [0, 0], strides = [1, 1]} : vector<1x8xi8> into vector<2x8xi8> +// CHECK: %[[VAL_21:.*]] = vector.shape_cast %[[VAL_20]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_22:.*]] = 
vector.shape_cast %[[VAL_19]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[KACC_1:.*]] = arm_neon.intr.smmla %[[KACC_0]], %[[VAL_21]], %[[VAL_22]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[VAL_24:.*]] = vector.shape_cast %[[KACC_1]] : vector<4xi32> to vector<2x2xi32> +// CHECK: %[[VAL_25:.*]] = vector.extract %[[VAL_24]][0] : vector<2xi32> from vector<2x2xi32> +// CHECK: %[[VAL_26:.*]] = vector.insert_strided_slice %[[VAL_25]], %[[VAL_17]] {offsets = [0, 0], strides = [1]} : vector<2xi32> into vector<1x2xi32> +// CHECK: %[[VAL_27:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 16], sizes = [1, 8], strides = [1, 1]} : vector<1x32xi8> to vector<1x8xi8> +// CHECK: %[[VAL_28:.*]] = vector.extract_strided_slice %[[VAL_6]] {offsets = [0, 16], sizes = [2, 8], strides = [1, 1]} : vector<2x32xi8> to vector<2x8xi8> +// CHECK: %[[VAL_29:.*]] = vector.insert_strided_slice %[[VAL_27]], %[[VAL_4]] {offsets = [0, 0], strides = [1, 1]} : vector<1x8xi8> into vector<2x8xi8> +// CHECK: %[[VAL_30:.*]] = vector.shape_cast %[[VAL_29]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_31:.*]] = vector.shape_cast %[[VAL_28]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[KACC_2:.*]] = arm_neon.intr.smmla %[[KACC_1]], %[[VAL_30]], %[[VAL_31]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[VAL_33:.*]] = vector.shape_cast %[[KACC_2]] : vector<4xi32> to vector<2x2xi32> +// CHECK: %[[VAL_34:.*]] = vector.extract %[[VAL_33]][0] : vector<2xi32> from vector<2x2xi32> +// CHECK: %[[VAL_35:.*]] = vector.insert_strided_slice %[[VAL_34]], %[[VAL_26]] {offsets = [0, 0], strides = [1]} : vector<2xi32> into vector<1x2xi32> +// CHECK: %[[VAL_36:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 24], sizes = [1, 8], strides = [1, 1]} : vector<1x32xi8> to vector<1x8xi8> +// CHECK: %[[VAL_37:.*]] = vector.extract_strided_slice %[[VAL_6]] {offsets = [0, 24], sizes = [2, 8], strides = [1, 1]} : vector<2x32xi8> to vector<2x8xi8> +// CHECK: %[[VAL_38:.*]] = vector.insert_strided_slice %[[VAL_36]], %[[VAL_4]] {offsets = [0, 0], strides = [1, 1]} : vector<1x8xi8> into vector<2x8xi8> +// CHECK: %[[VAL_39:.*]] = vector.shape_cast %[[VAL_38]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[VAL_40:.*]] = vector.shape_cast %[[VAL_37]] : vector<2x8xi8> to vector<16xi8> +// CHECK: %[[KACC_3:.*]] = arm_neon.intr.smmla %[[KACC_2]], %[[VAL_39]], %[[VAL_40]] : vector<16xi8> to vector<4xi32> +// CHECK: %[[VAL_42:.*]] = vector.shape_cast %[[KACC_3]] : vector<4xi32> to vector<2x2xi32> +// CHECK: %[[VAL_43:.*]] = vector.extract %[[VAL_42]][0] : vector<2xi32> from vector<2x2xi32> +// CHECK: %[[VAL_44:.*]] = vector.insert_strided_slice %[[VAL_43]], %[[VAL_35]] {offsets = [0, 0], strides = [1]} : vector<2xi32> into vector<1x2xi32> +// CHECK: return %[[VAL_44]] : vector<1x2xi32> +func.func @vector_arm_neon_k_unroll_vecmat(%lhs: vector<1x32xi8>, %rhs: vector<2x32xi4>, %acc : vector<1x2xi32>) -> vector<1x2xi32> { + %lhs_extsi = arith.extsi %lhs : vector<1x32xi8> to vector<1x32xi32> + %rhs_extsi = arith.extsi %rhs : vector<2x32xi4> to vector<2x32xi32> + %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %lhs_extsi, %rhs_extsi, %acc : vector<1x32xi32>, vector<2x32xi32> into vector<1x2xi32> + return %res : vector<1x2xi32> +} diff --git a/mlir/test/Dialect/EmitC/invalid_types.mlir b/mlir/test/Dialect/EmitC/invalid_types.mlir index 
f9d517bf689b95..0ad8d4eabe6b8b 100644 --- a/mlir/test/Dialect/EmitC/invalid_types.mlir +++ b/mlir/test/Dialect/EmitC/invalid_types.mlir @@ -97,3 +97,51 @@ func.func @illegal_float_type(%arg0: f80, %arg1: f80) { %mul = "emitc.mul" (%arg0, %arg1) : (f80, f80) -> f80 return } + +// ----- + +func.func @illegal_pointee_type() { + // expected-error @+1 {{'emitc.variable' op result #0 must be type supported by EmitC, but got '!emitc.ptr'}} + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> !emitc.ptr + return +} + +// ----- + +func.func @illegal_non_static_tensor_shape_type() { + // expected-error @+1 {{'emitc.variable' op result #0 must be type supported by EmitC, but got 'tensor'}} + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> tensor + return +} + +// ----- + +func.func @illegal_tensor_array_element_type() { + // expected-error @+1 {{'emitc.variable' op result #0 must be type supported by EmitC, but got 'tensor>'}} + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> tensor> + return +} + +// ----- + +func.func @illegal_tensor_integer_element_type() { + // expected-error @+1 {{'emitc.variable' op result #0 must be type supported by EmitC, but got 'tensor<9xi11>'}} + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> tensor<9xi11> + return +} + +// ----- + +func.func @illegal_tuple_array_element_type() { + // expected-error @+1 {{'emitc.variable' op result #0 must be type supported by EmitC, but got 'tuple, f32>'}} + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> tuple, f32> + return +} + +// ----- + +func.func @illegal_tuple_float_element_type() { + // expected-error @+1 {{'emitc.variable' op result #0 must be type supported by EmitC, but got 'tuple'}} + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> tuple + return +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 1f04f457068709..2f24dce4233e48 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -10,6 +10,58 @@ func.func @unknown_clause() { // ----- +func.func @not_wrapper() { + omp.distribute { + // expected-error@+1 {{op must take a loop wrapper role if nested inside of 'omp.distribute'}} + omp.parallel { + %0 = arith.constant 0 : i32 + omp.terminator + } + omp.terminator + } + + return +} + +// ----- + +func.func @invalid_nested_wrapper(%lb : index, %ub : index, %step : index) { + omp.distribute { + // expected-error@+1 {{only supported nested wrapper is 'omp.wsloop'}} + omp.parallel { + omp.simd { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } + omp.terminator + } + omp.terminator + } + omp.terminator + } + + return +} + +// ----- + +func.func @no_nested_wrapper(%lb : index, %ub : index, %step : index) { + omp.distribute { + // expected-error@+1 {{op must not wrap an 'omp.loop_nest' directly}} + omp.parallel { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } + omp.terminator + } + omp.terminator + } + + return +} + +// ----- + func.func @if_once(%n : i1) { // expected-error@+1 {{`if` clause can appear at most once in the expansion of the oilist directive}} omp.parallel if(%n : i1) if(%n : i1) { diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index e2ca12afc14bd6..c10fc88211c367 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -51,7 +51,7 @@ func.func @omp_terminator() -> () { omp.terminator } -func.func @omp_parallel(%data_var : memref, %if_cond : 
i1, %num_threads : i32) -> () { +func.func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : i32, %idx : index) -> () { // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var) ({ @@ -85,6 +85,24 @@ func.func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : i omp.terminator }) {operandSegmentSizes = array} : (memref, memref) -> () + // CHECK: omp.distribute + omp.distribute { + // CHECK-NEXT: omp.parallel + omp.parallel { + // CHECK-NEXT: omp.wsloop + // TODO Remove induction variables from omp.wsloop. + omp.wsloop for (%iv) : index = (%idx) to (%idx) step (%idx) { + // CHECK-NEXT: omp.loop_nest + omp.loop_nest (%iv2) : index = (%idx) to (%idx) step (%idx) { + omp.yield + } + omp.terminator + } + omp.terminator + } + omp.terminator + } + return } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index ac365c9d297e88..751c57eacd7ae5 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -2403,6 +2403,53 @@ func.func @dim_of_reshape_undominated(%arg0: tensor<*xf32>, %arg1: tensor +func.func @reshape_fold_2d(%arg0 : tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %d0 = tensor.dim %arg0, %c0 : tensor + %d1 = tensor.dim %arg0, %c1 : tensor + %ds = tensor.from_elements %d0, %d1 : tensor<2xindex> + %reshape = tensor.reshape %arg0(%ds) : (tensor, tensor<2xindex>) -> tensor + // CHECK: return %[[ARG0]] + return %reshape : tensor +} + +// ----- + +// CHECK-LABEL: @reshape_nofold_2d +// CHECK-SAME: %[[ARG0:.+]]: tensor +func.func @reshape_nofold_2d(%arg0 : tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %d0 = tensor.dim %arg0, %c0 : tensor + %d1 = tensor.dim %arg0, %c1 : tensor + %ds = tensor.from_elements %d1, %d0 : tensor<2xindex> + // CHECK: tensor.reshape + %reshape = tensor.reshape %arg0(%ds) : (tensor, tensor<2xindex>) -> tensor + return %reshape : tensor +} + + +// ----- + +// CHECK-LABEL: @reshape_fold_3d_cst +// CHECK-SAME: %[[ARG0:.+]]: tensor<5x?x?xi32> +func.func @reshape_fold_3d_cst(%arg0 : tensor<5x?x?xi32>) -> tensor<5x?x?xi32> { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %d0 = arith.constant 5 : index + %d1 = tensor.dim %arg0, %c1 : tensor<5x?x?xi32> + %d2 = tensor.dim %arg0, %c2 : tensor<5x?x?xi32> + %ds = tensor.from_elements %d0, %d1, %d2 : tensor<3xindex> + %reshape = tensor.reshape %arg0(%ds) : (tensor<5x?x?xi32>, tensor<3xindex>) -> tensor<5x?x?xi32> + // CHECK: return %[[ARG0]] + return %reshape : tensor<5x?x?xi32> +} + +// ----- + // Test case: This test fails to fold because the index of tensor.dim is out_of_bounds // CHECK-LABEL: func @dim_out_of_bounds( // CHECK: %[[IDX:.*]] = index.constant 28 diff --git a/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir new file mode 100644 index 00000000000000..d32954127fce61 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir @@ -0,0 +1,20 @@ +// RUN: mlir-opt -xegpu-fold-alias-ops -split-input-file %s | FileCheck %s + +func.func @fold_subview_with_xegpu_create_nd_tdesc(%arg0 : memref<256x256xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) ->(!xegpu.tensor_desc<8x16xf32>) { + %subview = memref.subview %arg0[%arg1, %arg2] [32, 32] [1, 1] : + memref<256x256xf32> to memref<32x32xf32, strided<[256, 1], 
+  %0 = xegpu.create_nd_tdesc %subview[%arg3, %arg4] :
+    memref<32x32xf32, strided<[256, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32>
+  return %0 : !xegpu.tensor_desc<8x16xf32>
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK: func @fold_subview_with_xegpu_create_nd_tdesc
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<256x256xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG3]]]
+// CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP]]()[%[[ARG2]], %[[ARG4]]]
+// CHECK: xegpu.create_nd_tdesc %[[ARG0]][%[[IDX0]], %[[IDX1]]] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32>
diff --git a/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir b/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir
index d3bc91339d164b..5df32555000ad0 100644
--- a/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir
+++ b/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir
@@ -32,6 +32,18 @@ func.func @no_layout_builtin() {
   // CHECK: preferred = 8
   // CHECK: size = 8
   "test.data_layout_query"() : () -> index
+  // CHECK: alignment = 16
+  // CHECK: bitsize = 128
+  // CHECK: index = 0
+  // CHECK: preferred = 16
+  // CHECK: size = 16
+  "test.data_layout_query"() : () -> vector<4xi32>
+  // CHECK: alignment = 16
+  // CHECK: bitsize = {minimal_size = 128 : index, scalable}
+  // CHECK: index = 0
+  // CHECK: preferred = 16
+  // CHECK: size = {minimal_size = 16 : index, scalable}
+  "test.data_layout_query"() : () -> vector<[4]xi32>
   return
 }
diff --git a/mlir/test/Target/LLVMIR/Import/ignore-composite-type-elements.ll b/mlir/test/Target/LLVMIR/Import/ignore-composite-type-elements.ll
new file mode 100644
index 00000000000000..d92d07085610f3
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/Import/ignore-composite-type-elements.ll
@@ -0,0 +1,47 @@
+; RUN: mlir-translate -import-llvm -mlir-print-debuginfo -drop-di-composite-type-elements -split-input-file %s | FileCheck %s
+
+; Verifies that the corresponding flag avoids the conversion of the elements of
+; the DICompositeType.
+
+; CHECK-NOT: di_derived_type
+; CHECK: #{{.+}} = #llvm.di_composite_type
+; CHECK-NOT: di_derived_type
+
+define void @composite_type() !dbg !3 {
+  ret void
+}
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!0}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
+!2 = !DIFile(filename: "debug-info.ll", directory: "/")
+!3 = distinct !DISubprogram(name: "composite_type", scope: !2, file: !2, spFlags: DISPFlagDefinition, unit: !1, type: !4)
+!4 = !DISubroutineType(types: !5)
+!5 = !{!6}
+!6 = !DICompositeType(tag: DW_TAG_class_type, name: "class", elements: !7)
+!7 = !{!9}
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, flags: DIFlagArtificial | DIFlagObjectPointer)
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "call_field", file: !2, baseType: !8)
+
+; // -----
+
+; CHECK: #{{.+}} = #llvm.di_composite_type
+
+define void @composite_type() !dbg !3 {
+  ret void
+}
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!0}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
+!2 = !DIFile(filename: "debug-info.ll", directory: "/")
+!3 = distinct !DISubprogram(name: "composite_type", scope: !2, file: !2, spFlags: DISPFlagDefinition, unit: !1, type: !4)
+!4 = !DISubroutineType(types: !5)
+!5 = !{!7}
+!6 = !DIBasicType(name: "int")
+!7 = !DICompositeType(tag: DW_TAG_array_type, name: "vector", flags: DIFlagVector, elements: !8, baseType: !6)
+!8 = !{!9}
+!9 = !DISubrange(count: 4)
diff --git a/mlir/test/lib/Conversion/MathToVCIX/TestMathToVCIXConversion.cpp b/mlir/test/lib/Conversion/MathToVCIX/TestMathToVCIXConversion.cpp
index c8bee817213d8d..e17fe12b9088bd 100644
--- a/mlir/test/lib/Conversion/MathToVCIX/TestMathToVCIXConversion.cpp
+++ b/mlir/test/lib/Conversion/MathToVCIX/TestMathToVCIXConversion.cpp
@@ -25,7 +25,7 @@ namespace {
 /// according to LLVM's encoding:
 /// https://lists.llvm.org/pipermail/llvm-dev/2020-October/145850.html
 static std::pair<unsigned, Type> legalizeVectorType(const Type &type) {
-  VectorType vt = type.cast<VectorType>();
+  VectorType vt = cast<VectorType>(type);
   // To simplify test pass, avoid multi-dimensional vectors.
   if (!vt || vt.getRank() != 1)
     return {0, nullptr};
@@ -39,7 +39,7 @@ static std::pair<unsigned, Type> legalizeVectorType(const Type &type) {
     sew = 32;
   else if (eltTy.isF64())
     sew = 64;
-  else if (auto intTy = eltTy.dyn_cast<IntegerType>())
+  else if (auto intTy = dyn_cast<IntegerType>(eltTy))
     sew = intTy.getWidth();
   else
     return {0, nullptr};
diff --git a/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp b/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp
index a4464bba7e8584..84f45b31603192 100644
--- a/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp
+++ b/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp
@@ -46,9 +46,22 @@ struct TestDataLayoutQuery
       Attribute programMemorySpace = layout.getProgramMemorySpace();
       Attribute globalMemorySpace = layout.getGlobalMemorySpace();
       uint64_t stackAlignment = layout.getStackAlignment();
+
+      auto convertTypeSizeToAttr = [&](llvm::TypeSize typeSize) -> Attribute {
+        if (!typeSize.isScalable())
+          return builder.getIndexAttr(typeSize);
+
+        return builder.getDictionaryAttr({
+            builder.getNamedAttr("scalable", builder.getUnitAttr()),
+            builder.getNamedAttr(
+                "minimal_size",
+                builder.getIndexAttr(typeSize.getKnownMinValue())),
+        });
+      };
+
       op->setAttrs(
-          {builder.getNamedAttr("size", builder.getIndexAttr(size)),
-           builder.getNamedAttr("bitsize", builder.getIndexAttr(bitsize)),
+          {builder.getNamedAttr("size", convertTypeSizeToAttr(size)),
+           builder.getNamedAttr("bitsize", convertTypeSizeToAttr(bitsize)),
            builder.getNamedAttr("alignment", builder.getIndexAttr(alignment)),
            builder.getNamedAttr("preferred", builder.getIndexAttr(preferred)),
            builder.getNamedAttr("index", builder.getIndexAttr(index)),
diff --git a/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
index 9b3082a819224f..5e3918f79d1844 100644
--- a/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
+++ b/mlir/test/lib/Dialect/Mesh/TestReshardingSpmdization.cpp
@@ -67,12 +67,11 @@ struct TestMeshReshardingRewritePattern : OpRewritePattern<ShardOp> {
     ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
     ShapedType sourceShardShape =
         shardShapedType(op.getResult().getType(), mesh, op.getShard());
-    TypedValue<ShapedType> sourceShard =
+    TypedValue<ShapedType> sourceShard = cast<TypedValue<ShapedType>>(
         builder
             .create<UnrealizedConversionCastOp>(sourceShardShape, op.getOperand())
-            ->getResult(0)
-            .cast<TypedValue<ShapedType>>();
+            ->getResult(0));
     TypedValue<ShapedType> targetShard =
         reshard(builder, mesh, op, targetShardOp, sourceShard);
     Value newTargetUnsharded =
diff --git a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp
index 2dd99c67c1439b..fa093cafcb0dc3 100644
--- a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp
+++ b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp
@@ -61,7 +61,7 @@ LogicalResult TestDialectLLVMIRTranslationInterface::amendOperation(
   }
 
   bool createSymbol = false;
-  if (auto boolAttr = attr.dyn_cast<BoolAttr>())
+  if (auto boolAttr = dyn_cast<BoolAttr>(attr))
     createSymbol = boolAttr.getValue();
 
   if (createSymbol) {
diff --git a/mlir/test/lib/IR/TestAffineWalk.cpp b/mlir/test/lib/IR/TestAffineWalk.cpp
index 8361b48ce42857..e8b836888b459f 100644
--- a/mlir/test/lib/IR/TestAffineWalk.cpp
+++ b/mlir/test/lib/IR/TestAffineWalk.cpp
@@ -44,7 +44,7 @@ void TestAffineWalk::runOnOperation() {
   // Test whether the walk is being correctly interrupted.
   m.walk([](Operation *op) {
     for (NamedAttribute attr : op->getAttrs()) {
-      auto mapAttr = attr.getValue().dyn_cast<AffineMapAttr>();
+      auto mapAttr = dyn_cast<AffineMapAttr>(attr.getValue());
       if (!mapAttr)
         return;
       checkMod(mapAttr.getAffineMap(), op->getLoc());
diff --git a/mlir/test/lib/IR/TestBuiltinAttributeInterfaces.cpp b/mlir/test/lib/IR/TestBuiltinAttributeInterfaces.cpp
index 71ed30bfbe34cd..93c4bcfe1424e9 100644
--- a/mlir/test/lib/IR/TestBuiltinAttributeInterfaces.cpp
+++ b/mlir/test/lib/IR/TestBuiltinAttributeInterfaces.cpp
@@ -51,7 +51,7 @@ struct TestElementsAttrInterface
       InFlightDiagnostic diag = op->emitError()
                                 << "Test iterating `" << type << "`: ";
-      if (!attr.getElementType().isa<IntegerType>()) {
+      if (!isa<IntegerType>(attr.getElementType())) {
         diag << "expected element type to be an integer type";
         return;
       }
diff --git a/mlir/test/lib/Rewrite/TestPDLByteCode.cpp b/mlir/test/lib/Rewrite/TestPDLByteCode.cpp
index 56af3c15b905f1..77aa30f847dcd0 100644
--- a/mlir/test/lib/Rewrite/TestPDLByteCode.cpp
+++ b/mlir/test/lib/Rewrite/TestPDLByteCode.cpp
@@ -61,7 +61,7 @@ static LogicalResult customTypeRangeResultConstraint(PatternRewriter &rewriter,
                                                      PDLResultList &results,
                                                      ArrayRef<PDLValue> args) {
   auto *op = args[0].cast<Operation *>();
-  int numTypes = args[1].cast<Attribute>().cast<IntegerAttr>().getInt();
+  int numTypes = cast<IntegerAttr>(args[1].cast<Attribute>()).getInt();
 
   if (op->getName().getStringRef() == "test.success_op") {
     SmallVector<Type> types;
diff --git a/mlir/test/python/dialects/async_dialect.py b/mlir/test/python/dialects/async_dialect.py
index f6181cc76118ed..13e3c42e57c21e 100644
--- a/mlir/test/python/dialects/async_dialect.py
+++ b/mlir/test/python/dialects/async_dialect.py
@@ -1,7 +1,8 @@
 # RUN: %PYTHON %s | FileCheck %s
 
 from mlir.ir import *
-import mlir.dialects.async_dialect
+from mlir.dialects import arith
+import mlir.dialects.async_dialect as async_dialect
 import mlir.dialects.async_dialect.passes
 from mlir.passmanager import *
 
@@ -11,6 +12,19 @@ def run(f):
     f()
 
 
+# CHECK-LABEL: TEST: testCreateGroupOp
+@run
+def testCreateGroupOp():
+    with Context() as ctx, Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            i32 = IntegerType.get_signless(32)
+            group_size = arith.ConstantOp(i32, 4)
+            async_dialect.create_group(group_size)
+        # CHECK: %0 = "arith.constant"() <{value = 4 : i32}> : () -> i32
+        # CHECK: %1 = "async.create_group"(%0) : (i32) -> !async.group
+        print(module)
+
 def testAsyncPass():
     with Context() as context:
         PassManager.parse("any(async-to-async-runtime)")
diff --git a/openmp/runtime/test/worksharing/for/collapse_test.inc b/openmp/runtime/test/worksharing/for/collapse_test.inc
index de0e7e4e57f30d..3075bd04e958fc 100644
--- a/openmp/runtime/test/worksharing/for/collapse_test.inc
+++ b/openmp/runtime/test/worksharing/for/collapse_test.inc
@@ -18,9 +18,9 @@
 #define MAX_THREADS 256
 
 #if defined VERBOSE
-#define PRINTF printf
+#define PRINTF(...) printf(__VA_ARGS__)
 #else
-#define PRINTF
+#define PRINTF(...)
 #endif
 
 LOOP_TYPE0 iLB, iUB;
@@ -106,12 +106,21 @@ int test() {
   unsigned scalarCount = 0;
   unsigned uselessThreadsOpenMP = 0;
   unsigned usefulThreadsOpenMP = 0;
-  unsigned chunkSizesOpenmp[MAX_THREADS] = {0};
-  unsigned num_threads = omp_get_max_threads();
+  // Use half of the available threads/logical processors.
+  unsigned num_threads = omp_get_max_threads() / 2;
+
+  // Make sure num_threads is not 0 after the division in case
+  // omp_get_max_threads() returns 1.
+ if (num_threads == 0) + num_threads = 1; + if (num_threads > MAX_THREADS) num_threads = MAX_THREADS; - omp_set_num_threads(num_threads); + + unsigned long *chunkSizesOpenmp = + (unsigned long *)malloc(sizeof(unsigned long) * num_threads); + memset(chunkSizesOpenmp, 0, sizeof(unsigned long) * num_threads); // count iterations and allocate space LOOP { ++trueCount; } @@ -129,10 +138,10 @@ int test() { // perform and record OpenMP iterations and thread use #pragma omp parallel num_threads(num_threads) { + unsigned gtid = omp_get_thread_num(); #pragma omp for collapse(3) private(i, j, k) LOOP { unsigned count; - unsigned gtid = omp_get_thread_num(); #pragma omp atomic update ++chunkSizesOpenmp[gtid]; #pragma omp atomic capture @@ -197,5 +206,6 @@ int test() { // clean up space FreeSpace(openmpSpace); FreeSpace(scalarSpace); + free(chunkSizesOpenmp); return pass; } diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index ef4574f7c144bb..6c732b8f134901 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3687,6 +3687,45 @@ cc_library( ], ) +gentbl_cc_library( + name = "XeGPUPassIncGen", + tbl_outs = [ + ( + [ + "-gen-pass-decls", + "-name=XeGPU", + ], + "include/mlir/Dialect/XeGPU/Transforms/Passes.h.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/XeGPU/Transforms/Passes.td", + deps = [":PassBaseTdFiles"], +) + +cc_library( + name = "XeGPUTransforms", + srcs = glob([ + "lib/Dialect/XeGPU/Transforms/*.cpp", + ]), + hdrs = glob([ + "include/mlir/Dialect/XeGPU/Transforms/*.h", + ]), + includes = ["include"], + deps = [ + ":AffineUtils", + ":IR", + ":MemRefDialect", + ":Pass", + ":SideEffectInterfaces", + ":Support", + ":TransformUtils", + ":XeGPUDialect", + ":XeGPUPassIncGen", + "//llvm:Support", + ], +) + td_library( name = "FuncTdFiles", srcs = [ @@ -9229,6 +9268,7 @@ cc_library( ":X86VectorDialect", ":X86VectorTransforms", ":XeGPUDialect", + ":XeGPUTransforms", ":config", ], ) diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index d6b0832f4c1a9b..3cc672476014f7 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -331,6 +331,53 @@ filegroup( ], ) +##---------------------------------------------------------------------------## +# Async dialect. +##---------------------------------------------------------------------------## + +gentbl_filegroup( + name = "AsyncOpsPyGen", + tbl_outs = [ + ( + [ + "-gen-python-enum-bindings", + "-bind-dialect=async", + ], + "mlir/dialects/_async_enum_gen.py", + ), + ( + [ + "-gen-python-op-bindings", + "-bind-dialect=async", + ], + "mlir/dialects/_async_ops_gen.py", + ), + ], + tblgen = "//mlir:mlir-tblgen", + td_file = "mlir/dialects/AsyncOps.td", + deps = [ + "//mlir:AsyncOpsTdFiles", + "//mlir:OpBaseTdFiles", + ], +) + +filegroup( + name = "AsyncOpsPyFiles", + srcs = [ + ":AsyncOpsPyGen", + ], +) + +filegroup( + name = "AsyncOpsPackagePyFiles", + srcs = glob(["mlir/dialects/async_dialect/*.py"]), +) + +filegroup( + name = "AsyncOpsPackagePassesPyFiles", + srcs = glob(["mlir/dialects/async_dialect/passes/*.py"]), +) + ##---------------------------------------------------------------------------## # Arith dialect. ##---------------------------------------------------------------------------##
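A note on the PRINTF change in openmp/runtime/test/worksharing/for/collapse_test.inc above: the old object-like form compiled in non-VERBOSE builds only by accident, because a call such as PRINTF("i=%d\n", i); expanded to ("i=%d\n", i); which is a comma expression that still evaluates its arguments. The variadic form removes the entire call. Below is a minimal standalone sketch of the pattern (not part of the patch; VERBOSE is the same guard macro the test already uses):

/* printf_macro_sketch.c - illustrates the variadic PRINTF idiom.
 * Build with -DVERBOSE to enable output; without it, every PRINTF
 * call expands to an empty statement rather than a leftover
 * comma expression. */
#include <stdio.h>

#if defined VERBOSE
#define PRINTF(...) printf(__VA_ARGS__)
#else
#define PRINTF(...)
#endif

int main(void) {
  int i = 42;
  PRINTF("i=%d\n", i); /* compiled out entirely unless VERBOSE is defined */
  return 0;
}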