diff --git a/.gitignore b/.gitignore index bb362782..d25acd55 100644 --- a/.gitignore +++ b/.gitignore @@ -24,8 +24,5 @@ # vim temp files .*.swp -src/build/ - # python compiled files *.pyc - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7d90f28a..33a91ee2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -266,7 +266,7 @@ if( BUILD_TEST ) endif( ) # This will define OPENCL_FOUND -find_package( OpenCL ) +find_package( OpenCL ${OPENCL_VERSION} ) # Find Boost on the system, and configure the type of boost build we want set( Boost_USE_MULTITHREADED ON ) @@ -288,7 +288,7 @@ endif() # Turn on maximum compiler verbosity if(CMAKE_COMPILER_IS_GNUCXX) - add_definitions(-pedantic -Wall -Wextra + add_definitions(# -pedantic -Wall -Wextra -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=500 ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes" CACHE STRING diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake index 746fbe61..9810dd29 100644 --- a/src/FindOpenCL.cmake +++ b/src/FindOpenCL.cmake @@ -1,5 +1,5 @@ # ######################################################################## -# Copyright 2013 Advanced Micro Devices, Inc. +# Copyright 2015 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ # limitations under the License. # ######################################################################## - # Locate an OpenCL implementation. # Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) # @@ -46,60 +45,122 @@ # target_link_libraries(foo ${OPENCL_LIBRARIES}) # #----------------------- +include( CheckSymbolExists ) +include( CMakePushCheckState ) + +if( DEFINED OPENCL_ROOT OR DEFINED ENV{OPENCL_ROOT}) + message( STATUS "Defined OPENCL_ROOT: ${OPENCL_ROOT}, ENV{OPENCL_ROOT}: $ENV{OPENCL_ROOT}" ) +endif( ) find_path(OPENCL_INCLUDE_DIRS - NAMES OpenCL/cl.h CL/cl.h - HINTS - ${OPENCL_ROOT}/include - $ENV{AMDAPPSDKROOT}/include - $ENV{CUDA_PATH}/include - PATHS - /usr/include - /usr/local/include - /usr/local/cuda/include - /opt/cuda/include - DOC "OpenCL header file path" + NAMES OpenCL/cl.h CL/cl.h + HINTS + ${OPENCL_ROOT}/include + $ENV{OPENCL_ROOT}/include + $ENV{AMDAPPSDKROOT}/include + $ENV{CUDA_PATH}/include + PATHS + /usr/include + /usr/local/include + /usr/local/cuda/include + DOC "OpenCL header file path" ) mark_as_advanced( OPENCL_INCLUDE_DIRS ) +message( STATUS "OPENCL_INCLUDE_DIRS: ${OPENCL_INCLUDE_DIRS}" ) + +set( OpenCL_VERSION "0.0" ) + +cmake_push_check_state( RESET ) +set( CMAKE_REQUIRED_INCLUDES "${OPENCL_INCLUDE_DIRS}" ) + +# Bug in check_symbol_exists prevents us from specifying a list of files, so we loop +# Only 1 of these files will exist on a system, so the other file will not clobber the output variable +if( APPLE ) + set( CL_HEADER_FILE "OpenCL/cl.h" ) +else( ) + set( CL_HEADER_FILE "CL/cl.h" ) +endif( ) + +check_symbol_exists( CL_VERSION_2_0 ${CL_HEADER_FILE} HAVE_CL_2_0 ) +check_symbol_exists( CL_VERSION_1_2 ${CL_HEADER_FILE} HAVE_CL_1_2 ) +check_symbol_exists( CL_VERSION_1_1 ${CL_HEADER_FILE} HAVE_CL_1_1 ) +# message( STATUS "HAVE_CL_2_0: ${HAVE_CL_2_0}" ) +# message( STATUS "HAVE_CL_1_2: ${HAVE_CL_1_2}" ) +# message( STATUS "HAVE_CL_1_1: ${HAVE_CL_1_1}" ) + +# set OpenCL_VERSION to the highest detected version +if( HAVE_CL_2_0 ) + set( OpenCL_VERSION "2.0" ) +elseif( HAVE_CL_1_2 ) + set( OpenCL_VERSION "1.2" ) +elseif( HAVE_CL_1_1 ) + set( OpenCL_VERSION "1.1" ) +endif( ) + +cmake_pop_check_state( ) # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) +if( LIB64 ) + message( STATUS "FindOpenCL searching for 64-bit libraries" ) +else( ) + message( STATUS "FindOpenCL searching for 32-bit libraries" ) +endif( ) if( LIB64 ) - find_library( OPENCL_LIBRARIES - NAMES OpenCL - HINTS - ${OPENCL_ROOT}/lib - $ENV{AMDAPPSDKROOT}/lib - $ENV{CUDA_PATH}/lib - DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86_64 x64 x86_64/sdk - PATHS - /usr/lib - /usr/local/cuda/lib - /opt/cuda/lib - ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86_64 x64 x86_64/sdk + PATHS + /usr/lib + /usr/local/cuda/lib + ) else( ) - find_library( OPENCL_LIBRARIES - NAMES OpenCL - HINTS - ${OPENCL_ROOT}/lib - $ENV{AMDAPPSDKROOT}/lib - $ENV{CUDA_PATH}/lib - DOC "OpenCL dynamic library path" - PATH_SUFFIXES x86 Win32 - - PATHS - /usr/lib - /usr/local/cuda/lib - /opt/cuda/lib - ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86 Win32 + PATHS + /usr/lib + /usr/local/cuda/lib + ) endif( ) mark_as_advanced( OPENCL_LIBRARIES ) +# message( STATUS "OpenCL_FIND_VERSION: ${OpenCL_FIND_VERSION}" ) +if( OpenCL_VERSION VERSION_LESS OpenCL_FIND_VERSION ) + message( FATAL_ERROR "Requested OpenCL version: ${OpenCL_FIND_VERSION}, Found OpenCL version: ${OpenCL_VERSION}" ) +endif( ) + +# If we asked for OpenCL 1.2, and we found a version installed greater than that, pass the 'use deprecated' flag +if( (OpenCL_FIND_VERSION VERSION_LESS "2.0") AND (OpenCL_VERSION VERSION_GREATER OpenCL_FIND_VERSION) ) + add_definitions( -DCL_USE_DEPRECATED_OPENCL_2_0_APIS ) + + # If we asked for OpenCL 1.1, and we found a version installed greater than that, pass the 'use deprecated' flag + if( (OpenCL_FIND_VERSION VERSION_LESS "1.2") AND (OpenCL_VERSION VERSION_GREATER OpenCL_FIND_VERSION) ) + add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS ) + endif( ) +endif( ) + include( FindPackageHandleStandardArgs ) -FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL + REQUIRED_VARS OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS + VERSION_VAR OpenCL_VERSION + ) if( NOT OPENCL_FOUND ) message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +else( ) + message(STATUS "FindOpenCL ${OPENCL_LIBRARIES}, ${OPENCL_INCLUDE_DIRS}") endif() diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt index 1925b6c2..d94b6656 100644 --- a/src/library/CMakeLists.txt +++ b/src/library/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -99,7 +99,7 @@ set_property( CACHE AUTOGEMM_ARCHITECTURE PROPERTY STRINGS "Hawaii" "Fiji" ) # opencl compiler version #set( PRECOMPILE_GEMM_OPENCL_VERSION "2.0" CACHE STRING "OpenCL compiler version supported by device driver." ) #set_property( CACHE PRECOMPILE_GEMM_OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 ) -#message( STATUS "AutoGemm PreCompiler will use OpenCL ${PRECOMPILE_GEMM_OPENCL_VERSION} compiler." ) +#message( STATUS "AutoGemm PreCompiler will use OpenCL ${PRECOMPILE_GEMM_OPENCL_VERSION} compiler." ) # PreCompile precision selected? set( PRECOMPILE_GEMM_PRECISION_SELECTED OFF) @@ -554,8 +554,8 @@ set(SRC_CL_TEMPLATES_GEN dtrsm_gpu192.clHawaii_64.bin.cl dgemm_hawaiiChannelConfilct.clHawaii_64.bin.cl dgemm_hawaiiSplitKernel.clHawaii_64.bin.cl - sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl - sgemm_hawaiiSplitKernel.clBonaire_64.bin.cl + sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl + sgemm_hawaiiSplitKernel.clBonaire_64.bin.cl dgemm_hawai.clTahiti_64.bin.cl dtrsm_gpu.clTahiti_64.bin.cl dgemm_gcn_SmallMatrices.clHawaii_64.bin.cl @@ -628,7 +628,7 @@ set(CLBLAS_SOURCES ) set(GLOBAL_HEADERS ${clBLAS_SOURCE_DIR}/clBLAS.h - ${clBLAS_SOURCE_DIR}/clBLAS-complex.h + ${clBLAS_SOURCE_DIR}/clBLAS-complex.h ${clBLAS_SOURCE_DIR}/include/clkern.h ${clBLAS_SOURCE_DIR}/include/cltypes.h ${clBLAS_SOURCE_DIR}/include/dblock_kgen.h @@ -673,7 +673,7 @@ if( BLAS_DUMP_CLBLAS_KERNELS ) add_definitions( -DDUMP_CLBLAS_KERNELS ) endif() -option( BLAS_KEEP_KERNEL_SOURCES "Prevent the library from stripping source from kernels" OFF ) +option( BLAS_KEEP_KERNEL_SOURCES "Prevent the library from stripping source from kernels" ON ) if( BLAS_KEEP_KERNEL_SOURCES ) add_definitions( -DKEEP_CLBLAS_KERNEL_SOURCES ) endif() @@ -723,7 +723,7 @@ add_custom_command(TARGET OCLBinaryGenerator_GEN PRE_BUILD COMMAND ${CMAKE_COMMAND} -DOCLBinaryGeneratorBinaryDir=${OCLBinaryGeneratorBinaryDir} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DOCL_COMPILER_FLAGS=${OCL_COMPILER_FLAGS} -P "${CMAKE_SOURCE_DIR}/library/OCLBinaryGenerator.cmake" - ) + ) add_dependencies( OCLBinaryGenerator_GEN OCLBinaryGenerator ) endif() @@ -759,7 +759,7 @@ message(STATUS "OPENCL_VERSION = ${OPENCL_VERSION}") # list(GET OPENCL_FLAGS_REPLACED 1 OPENCL_FLAGS_REPLACED_1)#flags for TAHITI # list(GET OPENCL_FLAGS_REPLACED 3 OPENCL_FLAGS_REPLACED_3)#flags for HAWAII 1 # list(GET OPENCL_FLAGS_REPLACED 5 OPENCL_FLAGS_REPLACED_5)#flags for HAWAII 2 -# list(GET OPENCL_FLAGS_REPLACED 7 OPENCL_FLAGS_REPLACED_7)#flags for BONAIRE +# list(GET OPENCL_FLAGS_REPLACED 7 OPENCL_FLAGS_REPLACED_7)#flags for BONAIRE # else() # MESSAGE(STATUS "flags.txt not found. will use the default flags.") # set (LOAD_CL_FLAGS FALSE) @@ -787,19 +787,19 @@ if (LOAD_CL_FLAGS) add_custom_target( GEN_CLBIN ) add_custom_command(TARGET GEN_CLBIN PRE_BUILD - COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" - -DLOAD_CL_FLAGS=${LOAD_CL_FLAGS} -DTAHITI_FLAG=${OPENCL_FLAGS_REPLACED_1} -DHAWAII1_FLAG=${OPENCL_FLAGS_REPLACED_3} -DHAWAII2_FLAG=${OPENCL_FLAGS_REPLACED_5} -DBONAIRE_FLAG=${OPENCL_FLAGS_REPLACED_7} - -DENV_PATH=${ENV_PATH} -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} + COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" + -DLOAD_CL_FLAGS=${LOAD_CL_FLAGS} -DTAHITI_FLAG=${OPENCL_FLAGS_REPLACED_1} -DHAWAII1_FLAG=${OPENCL_FLAGS_REPLACED_3} -DHAWAII2_FLAG=${OPENCL_FLAGS_REPLACED_5} -DBONAIRE_FLAG=${OPENCL_FLAGS_REPLACED_7} + -DENV_PATH=${ENV_PATH} -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} -DOPENCL_OFFLINE_BUILD_TAHITI_KERNEL=${OPENCL_OFFLINE_BUILD_TAHITI_KERNEL} -P "${CMAKE_SOURCE_DIR}/library/bingen.cmake" - ) + ) add_dependencies( GEN_CLBIN bingen ) else() add_custom_target( GEN_CLBIN ) add_custom_command(TARGET GEN_CLBIN PRE_BUILD - COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" - -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} + COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" + -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} -DOPENCL_OFFLINE_BUILD_TAHITI_KERNEL=${OPENCL_OFFLINE_BUILD_TAHITI_KERNEL} -P "${CMAKE_SOURCE_DIR}/library/bingen.cmake" ) diff --git a/src/library/blas/AutoGemm/Includes.py b/src/library/blas/AutoGemm/Includes.py index 3c8435fa..0f616956 100644 --- a/src/library/blas/AutoGemm/Includes.py +++ b/src/library/blas/AutoGemm/Includes.py @@ -113,7 +113,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getRowName() @@ -123,7 +123,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getColName() @@ -133,7 +133,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getCornerName() @@ -143,7 +143,7 @@ def addKernel(self, kernel): self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" - self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName + # self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" self.incFile.write( self.incStr ) diff --git a/src/library/blas/AutoGemm/KernelOpenCL.py b/src/library/blas/AutoGemm/KernelOpenCL.py index d7835d56..87a56761 100644 --- a/src/library/blas/AutoGemm/KernelOpenCL.py +++ b/src/library/blas/AutoGemm/KernelOpenCL.py @@ -482,7 +482,7 @@ def writeOpenCLKernelToFile(kernel): kernelFile.write("\";\n") kernelFile.write("\n") kernelFile.write("#else\n") - kernelFile.write("#pragma message(\"AutoGemmKernelSources.cpp: %s was overriden by user kernel.\")\n" % kernel.getName() ) + # kernelFile.write("#pragma message(\"AutoGemmKernelSources.cpp: %s was overriden by user kernel.\")\n" % kernel.getName() ) kernelFile.write("#endif\n") kernelFile.close() diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp index 04c07e8a..4eb654d2 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NN_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NN_B0_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NN_B0_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NN_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp index 05417daa..0fbde4a5 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NN_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NN_B1_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NN_B1_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NN_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp index ffe879af..d35d8140 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NT_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NT_B0_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NT_B0_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NT_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp index 5af48fcb..e13eda7f 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_NT_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NT_B1_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_NT_B1_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_NT_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp index 1bed066f..e9710aec 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_TN_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_TN_B0_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_TN_B0_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_TN_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp index 927952b2..43429334 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_DGEMM_COL_TN_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_TN_B1_MX048_NX048_KX08_SRC_H -#pragma message("AutoGemm's dgemm_Col_TN_B1_MX048_NX048_KX08_src overriden by user.") +// #pragma message("AutoGemm's dgemm_Col_TN_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp index b8ba4e85..033f9269 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B0_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp index be06d446..99813f33 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B0_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp index c1f92569..ef8a648b 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B0_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp index cc90ff90..c666ed5c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_BRANCH_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp index b2f8306f..9c0eb191 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) @@ -79,7 +79,7 @@ __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16 ( __local float* plA = lA + idyT*33+idxT; __local float* plB = lB + idxT*33+idyT; - + barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp index 80aeceaa..44b5acfb 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp index 26e354b5..2bdefd7c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NN_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NN_B1_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NN_B1_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp index d513c81b..fec4f08c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B0_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp index 11558a03..e0437cf7 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B0_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp index a8d0fec1..7f66ea07 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B0_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp index f26ddece..1e8e76a9 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_BRANCH_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp index a01958f1..7ca44c5c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp @@ -6,7 +6,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SINGLE_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SINGLE_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp index 1a2ca972..1c2974a8 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp index ae477cbe..ebc42b61 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp @@ -6,7 +6,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX064_KX16_ROW_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX064_KX16_ROW_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp index 3d39977e..b0213c93 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp @@ -6,7 +6,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX064_NX032_KX16_COLUMN_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX064_NX032_KX16_COLUMN_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp index d84d4ecd..e22d616b 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp index 7e4401db..9a014e05 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX096_NX096_KX16_src overriden by user.") #include "UserGemmKernelSourceIncludes.h" diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp index 4c5ceb4d..ecc4d5f6 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_NT_B1_MX128_NX128_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX128_NX128_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_NT_B1_MX128_NX128_KX16_src (if exists) overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_NT_B1_MX128_NX128_KX16_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" @@ -53,7 +53,7 @@ const char * const sgemm_Col_NT_B1_MX128_NX128_KX16_src = STRINGIFY( rC[4][0] = mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0] = mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[6][0] = mad(rA[0][6],rB[0][0],rC[6][0]); \ - rC[7][0] = mad(rA[0][7],rB[0][0],rC[7][0]); \ + rC[7][0] = mad(rA[0][7],rB[0][0],rC[7][0]); \ rC[0][1] = mad(rA[0][0], rB[0][1], rC[0][1]); \ rC[1][1] = mad(rA[0][1], rB[0][1], rC[1][1]); \ rC[2][1] = mad(rA[0][2], rB[0][1], rC[2][1]); \ @@ -284,7 +284,7 @@ uint offsetC) C[80 * ldc] = alpha*rC[7][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[7][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[7][7] + beta*C[112 * ldc]; - + } ); #endif diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp index 5722f9ee..cf1f406b 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B0_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp index fd80cb99..2dfd586f 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B0_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp index 48323fc3..ccf23bd7 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B0_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp index a41a09ef..00e3e661 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_BRANCH_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp index 1b435748..5f2ed47c 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp index a678e204..ae198149 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX064_NX064_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX064_NX064_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp index f15882fa..7985e474 100644 --- a/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp +++ b/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp @@ -4,7 +4,7 @@ #ifndef KERNEL_SGEMM_COL_TN_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX096_NX096_KX16_SRC_H -#pragma message("AutoGemm's sgemm_Col_TN_B1_MX096_NX096_KX16_src overriden by user.") +// #pragma message("AutoGemm's sgemm_Col_TN_B1_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) diff --git a/src/library/blas/gens/trmm.c b/src/library/blas/gens/trmm.c index 7655af34..0c8f8b4f 100644 --- a/src/library/blas/gens/trmm.c +++ b/src/library/blas/gens/trmm.c @@ -1245,7 +1245,7 @@ static int trmmGetDefaultDecomp( PGranularity *pgran, unsigned int subdimsNum, void *pArgs) { - (void*)subdimsNum; + DUMMY_ARG_USAGE(subdimsNum); if ( NULL == pArgs ) { return -EINVAL; diff --git a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp index 1f7c19c1..f3d6ca50 100644 --- a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp +++ b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP #define KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP -#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp index bc9c2961..f039b275 100644 --- a/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp +++ b/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP #define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP -#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -64,17 +63,17 @@ uint na)\n {\n if(tx <= i && i+bx*BLOCK_SIZE < na )\n {\n - Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);\n + Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);\n }\n else\n {\n Bs[i*BLOCK_SIZE+tx] = ZERO;\n }\n - }\n + }\n // read in the whole square block of my A and zero out the non data triangular - + // Synchronize to make sure the matrices are loaded - //__syncthreads(); + //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE);\n // solve the diagonals @@ -92,7 +91,7 @@ uint na)\n else \n {\n Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;\n - }\n + }\n }\n barrier(CLK_LOCAL_MEM_FENCE);\n @@ -139,14 +138,14 @@ uint na)\n // __syncthreads(); barrier(CLK_LOCAL_MEM_FENCE);\n }\n - + // write back A _Pragma("unroll")\n for( i=0; i < BLOCK_SIZE; i++ )\n {\n *(d_dinvA+i*NB+tx) = Bs[i*BLOCK_SIZE+tx];\n }\n - + }\n // end of kernel ); diff --git a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp index 0ffbebf7..0d81ee20 100644 --- a/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp +++ b/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP #define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP -#pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -43,10 +42,10 @@ double neg_switcher; \n // Thread index int tx = get_local_id(0); \n -// Thread index +// Thread index int gx = get_global_id(0); \n -// Block index +// Block index int bx = get_group_id(0); \n A = A + offA; \n @@ -56,7 +55,7 @@ int NumBLperNB = NB / BLOCK_SIZE; \n d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); \n __local double Bs[BLOCK_SIZE*BLOCK_SIZE]; \n -__local double workspace[BLOCK_SIZE];\n // workspace used to store the current working column +__local double workspace[BLOCK_SIZE];\n // workspace used to store the current working column // load A \n _Pragma("unroll")\n @@ -74,7 +73,7 @@ for (i = 0; i < BLOCK_SIZE; i++)\n // read in the whole square block of my A and zero out the non data triangular // Synchronize to make sure the matrices are loaded -//__syncthreads(); +//__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n // solve the diagonals diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp index c0e3b4cd..f0c041fb 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp @@ -5,7 +5,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ Ain = Ain + offAin; \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n @@ -139,7 +138,7 @@ Ain = Ain + offAin; \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n - + B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp index 71c13dc2..dbffeb95 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp @@ -5,7 +5,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp index 237d3fe1..fd410a9a 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp @@ -1,14 +1,13 @@ /******************************************************************************* * Hand-tuned kernel - + * B21 = -inv(A11)*A12*inv(A22) * 16 to 32 - + ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -76,13 +75,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp index 46b7e970..e4bde337 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp @@ -6,7 +6,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -73,13 +72,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2;\n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp index 3358af68..43760b63 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp index e480d6bb..12efa1a4 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp @@ -6,7 +6,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp index 6c04dee7..f0df0698 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp index eef824c7..11fa10b6 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -73,13 +72,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0;\n // there is already an overflow on xa + maxA = 0;\n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp index 0f64809d..ad5d5487 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp index 83e0c7e4..cf387855 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp index 5ce3e42a..923f4763 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp index af7f518f..31a97fad 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n - maxA = lda*na; \n // macro READA will detect overflow on y dimension + maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp index 51a3e87a..315908ed 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -74,13 +73,13 @@ int PagesPerNB = NB / (blk * 2); \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n - // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) + // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n - maxA = 0; \n // there is already an overflow on xa + maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp index 674fdd58..a4cd85c6 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp index a45494b5..f13e19b9 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp @@ -7,7 +7,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp index d2077bf1..b576114e 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp @@ -6,7 +6,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp index 004a8d20..adb22d74 100644 --- a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp @@ -8,7 +8,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp index 79bc4c01..4d645bc2 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp @@ -1,14 +1,13 @@ /******************************************************************************* * Hand-tuned kernel - + * B21 = -inv(A11)*A12*inv(A22) * 12 to 24 - + ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp index 6b62eb44..f6465d37 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp index d8c2f992..1e46a8d8 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp index dafa65b9..3dc05169 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -54,7 +53,7 @@ const char * const triple_dgemm_update_192_48_PART1_R_src = STRINGIFY( //each workgroup loads half of B (up or down) B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n - //decide invA12 location for each page; + //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4 of C C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n diff --git a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp index 4571112d..37ea0a2a 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp index 8e58ab65..1416ff3c 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ @@ -55,7 +54,7 @@ const char * const triple_dgemm_update_192_96_PART1_R_src = STRINGIFY( //each workgroup loads 1/4 of B (up or down) B = d_dinvA + page_block*NB*NB + blk*NB + blk + gidy*(blk / 4)*NB; \n - //decide invA12 location for each page; + //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4*1/4 of C C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; \n diff --git a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp index 2ff217d1..9e961ffe 100644 --- a/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp +++ b/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp @@ -4,7 +4,6 @@ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP -#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ diff --git a/src/library/blas/xgemm.cc b/src/library/blas/xgemm.cc index eb781127..a2c6cb00 100644 --- a/src/library/blas/xgemm.cc +++ b/src/library/blas/xgemm.cc @@ -170,7 +170,7 @@ void makeGemmKernel( #if defined( _WIN32 ) __declspec( thread ) static kernel_map_t *kernel_map = 0; #else - __thread static kernel_map_t *kernel_map = 0; + static __thread kernel_map_t *kernel_map = 0; #endif if (!kernel_map) { kernel_map = new kernel_map_t(); @@ -317,11 +317,11 @@ void makeGemmKernel( * get precision string *****************************************************************************/ template -char * getPrecision(); -template<> char * getPrecision() { return "s"; } -template<> char * getPrecision() { return "d"; } -template<> char * getPrecision() { return "c"; } -template<> char * getPrecision() { return "z"; } +const char * getPrecision(); +template<> const char * getPrecision() { return "s"; } +template<> const char * getPrecision() { return "d"; } +template<> const char * getPrecision() { return "c"; } +template<> const char * getPrecision() { return "z"; } /****************************************************************************** @@ -500,7 +500,7 @@ clblasGemm( &unroll); // make sure gemmSelectKernel found a valid kernel if (!tileKernelSource) { - printf("ERROR: gemmSelectKernel() couldn't find kernel(s) for { order=%s, transA=%s, transB=%s, M=%llu, N=%llu, K=%llu, beta=%u, onept=%f }\n", + printf("ERROR: gemmSelectKernel() couldn't find kernel(s) for { order=%s, transA=%s, transB=%s, M=%u, N=%u, K=%u, beta=%u, onept=%f }\n", order==clblasColumnMajor ? "ColMajor" : "RowMajor", transA==clblasNoTrans ? "N" : transA==clblasTrans ? "T" : "C", transB==clblasNoTrans ? "N" : transB==clblasTrans ? "T" : "C", @@ -566,8 +566,7 @@ clblasGemm( /****************************************************************************** * Build kernels *****************************************************************************/ - - + cl_kernel tileClKernel = NULL; cl_kernel rowClKernel = NULL; cl_kernel colClKernel = NULL; @@ -688,14 +687,14 @@ clblasSgemm( clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_FLOAT, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_FLOAT, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) @@ -748,14 +747,14 @@ clblasDgemm( clblasOrder order, clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_DOUBLE, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) @@ -809,14 +808,14 @@ clblasCgemm( clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_COMPLEX_FLOAT, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) @@ -870,14 +869,14 @@ clblasZgemm( clblasErr = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + if (K != 0) { //check matrix A clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, transA, M, K, A, offA, lda, A_MAT_ERRSET); if (clblasErr != clblasSuccess) return clblasErr; - + //check matrix B clblasErr = checkMatrixSizes(TYPE_COMPLEX_DOUBLE, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET); if (clblasErr != clblasSuccess) diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 9ecfd13e..b3944aa5 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -251,7 +251,7 @@ endif() # Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long # http://code.google.com/p/googletest/issues/detail?id=334 if( CMAKE_COMPILER_IS_GNUCXX ) - add_definitions( -Wno-long-long ) + add_definitions( -Wno-long-long -Wno-variadic-macros ) endif( ) if( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) @@ -259,7 +259,7 @@ if( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) # By default, -Mipa=fast is used, and this does not mix well with the cl compiler string( REPLACE "-Mipa=fast" "" CMAKE_Fortran_FLAGS_RELEASE ${CMAKE_Fortran_FLAGS_RELEASE} ) - + # In windows, dynamically link to the C runtime, and tell fortran linker to not include default main subroutine if( WIN32 ) set( CMAKE_EXE_LINKER_FLAGS "-Bdynamic -Mnostartup ${CMAKE_EXE_LINKER_FLAGS}" ) @@ -296,7 +296,7 @@ if( GTEST_FOUND ) ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include) add_definitions(-DCORR_TEST_WITH_ACML) - + add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) @@ -311,8 +311,8 @@ if( GTEST_FOUND ) set_target_properties(test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS) set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) - # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with - # gcc > 4.3.2 to support ACML. + # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with + # gcc > 4.3.2 to support ACML. # If the runtime is being built by the project, use it, otherwise link to a runtime library specified in the install prefix if( BUILD_RUNTIME ) target_link_libraries(test-correctness ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) @@ -350,7 +350,7 @@ if( GTEST_FOUND ) set_target_properties( test-medium PROPERTIES LINKER_LANGUAGE Fortran ) set_target_properties( test-short PROPERTIES LINKER_LANGUAGE Fortran ) endif( ) - + if( BUILD_RUNTIME ) if( NETLIB_FOUND ) target_link_libraries(test-correctness ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) @@ -373,7 +373,7 @@ if( GTEST_FOUND ) endif( ) endif( ) endif( ) - + set_property( TARGET test-correctness PROPERTY FOLDER "Test") set_property( TARGET test-medium PROPERTY FOLDER "Test") set_property( TARGET test-short PROPERTY FOLDER "Test") @@ -384,7 +384,7 @@ if( GTEST_FOUND ) LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) - + get_target_property( testLocation test-correctness LOCATION ) configure_file( @@ -395,7 +395,7 @@ if( GTEST_FOUND ) # Register script at run at install time to analyze the executable and copy dependencies into package install( SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake") - + if( ACML_FOUND ) include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include) @@ -430,7 +430,7 @@ if( GTEST_FOUND ) add_executable(test-functional ${SRC_FUNC} ${SRC_COMMON} ${SRC_COMMON_TIMER} ${FUNC_HEADERS} ${TESTS_HEADERS}) - + set_target_properties( test-functional PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) if( BUILD_RUNTIME ) target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} clBLAS ) diff --git a/src/tests/correctness/corr-trmv.cpp b/src/tests/correctness/corr-trmv.cpp index 7e97d6c3..a8c7151a 100644 --- a/src/tests/correctness/corr-trmv.cpp +++ b/src/tests/correctness/corr-trmv.cpp @@ -127,7 +127,7 @@ trmvCorrectnessTest(TestParams *params) // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_WRITE_ONLY); - bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_ONLY); + bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_WRITE); //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); diff --git a/src/tests/include/cmdline.h b/src/tests/include/cmdline.h index 68ddfba1..b7679732 100644 --- a/src/tests/include/cmdline.h +++ b/src/tests/include/cmdline.h @@ -44,7 +44,7 @@ typedef enum SetoptFlags { SET_INCY = (1 << 9), SET_NUM_COMMAND_QUEUES = (1 << 10), SET_DEVICE_ORD = (1 << 11), - SET_PLATFORM_ORD = (1 << 12), + SET_PLATFORM_ORD = (1 << 12) } SetoptFlags; typedef struct TestParams { diff --git a/src/tests/include/matrix.h b/src/tests/include/matrix.h index 65757add..8794f0b8 100644 --- a/src/tests/include/matrix.h +++ b/src/tests/include/matrix.h @@ -310,21 +310,21 @@ compareMatrices( const cl_double *absDelta = NULL) { size_t m = 0, n = 0; - T a, b; + T ref, clresult; cl_double delta; if( lda > 0 ) // General case { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); + if( module(ref-clresult) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(ref, clresult, delta); } } } @@ -336,14 +336,14 @@ compareMatrices( { for( m=n; m < M; m++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); + if( module(ref-clresult) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(ref, clresult, delta); } } } @@ -353,14 +353,14 @@ compareMatrices( { for( n = 0; n <= m; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(a, b, delta); + if( module(ref-clresult) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); + ASSERT_NEAR(ref, clresult, delta); } } } @@ -379,23 +379,23 @@ compareMatrices( const cl_double *absDelta) { size_t m = 0, n = 0; - FloatComplex a, b; + FloatComplex ref, clresult; cl_double delta; if ( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -407,16 +407,16 @@ if ( lda > 0 ) { for( m=n; m < M; m++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -426,16 +426,16 @@ if ( lda > 0 ) { for( n = 0; n <= m; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -455,22 +455,22 @@ compareMatrices( const cl_double *absDelta) { size_t m = 0, n = 0; - DoubleComplex a, b; + DoubleComplex ref, clresult; cl_double delta; if( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -482,16 +482,16 @@ if( lda > 0 ) { for( m=n; m < M; m++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -501,16 +501,16 @@ if( lda > 0 ) { for( n = 0; n <= m; n++) { - a = getElement(order, clblasNoTrans, m, n, A, lda); - b = getElement(order, clblasNoTrans, m, n, B, lda); + ref = getElement(order, clblasNoTrans, m, n, A, lda); + clresult = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } - if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) + if( (module(CREAL(ref) - CREAL(clresult)) > delta) || (module(CIMAG(ref) - CIMAG(clresult)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); - ASSERT_NEAR(CREAL(a), CREAL(b), delta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); + ASSERT_NEAR(CREAL(ref), CREAL(clresult), delta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), delta); } } } @@ -764,10 +764,10 @@ template static void compareValues( const T *A, const T *B, const cl_double absDelta=0.0 ) { - T a, b; - a = *A; - b = *B; - ASSERT_NEAR(a, b, absDelta); + T ref, clresult; + ref = *A; + clresult = *B; + ASSERT_NEAR(ref, clresult, absDelta); } template<> @@ -775,12 +775,12 @@ __template_static void compareValues ( const FloatComplex *A, const FloatComplex *B, const cl_double absDelta ) { - FloatComplex a, b; + FloatComplex ref, clresult; - a = *A; - b = *B; - ASSERT_NEAR(CREAL(a), CREAL(b), absDelta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), absDelta); + ref = *A; + clresult = *B; + ASSERT_NEAR(CREAL(ref), CREAL(clresult), absDelta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), absDelta); } template<> @@ -788,11 +788,11 @@ __template_static void compareValues ( const DoubleComplex *A, const DoubleComplex *B, const cl_double absDelta ) { - DoubleComplex a, b; + DoubleComplex ref, clresult; - a = *A; - b = *B; - ASSERT_NEAR(CREAL(a), CREAL(b), absDelta); - ASSERT_NEAR(CIMAG(a), CIMAG(b), absDelta); + ref = *A; + clresult = *B; + ASSERT_NEAR(CREAL(ref), CREAL(clresult), absDelta); + ASSERT_NEAR(CIMAG(ref), CIMAG(clresult), absDelta); } #endif // MATRIX_H_