From 8bd602d1932e2971c73604e9942d107f5fcebeb1 Mon Sep 17 00:00:00 2001 From: tim Date: Tue, 14 Jun 2016 16:58:18 -0500 Subject: [PATCH 1/5] (1)update readme: netlib is preferred.(2)now you can verify the correctness of gemm&trmm through client --- README.md | 4 +- src/client/CMakeLists.txt | 8 +- src/client/clfunc_xgemm.hpp | 569 +++++++++++++++----------- src/client/clfunc_xtrmm.hpp | 258 ++++++++---- src/client/client.cpp | 967 ++++++++++++++++++++++---------------------- 5 files changed, 1003 insertions(+), 803 deletions(-) diff --git a/README.md b/README.md index cd734da4..9f4af967 100644 --- a/README.md +++ b/README.md @@ -197,8 +197,10 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate ### Test infrastructure * Googletest v1.6 -* ACML on windows/linux; Accelerate on Mac OSX * Latest Boost +* CPU BLAS + - Netlib CBLAS (recommended: "apt-get install libblas-dev" if on ubuntu) + - or ACML on windows/linux; Accelerate on Mac OSX ### Performance infrastructure * Python diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt index eb66f8a0..e0bc0029 100644 --- a/src/client/CMakeLists.txt +++ b/src/client/CMakeLists.txt @@ -1,12 +1,12 @@ # ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -51,7 +51,7 @@ include_directories( .) add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER}) -target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS) +target_link_libraries(client ${Netlib_LIBRARIES} ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS) set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" OUTPUT_NAME clBLAS-client ) diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp index 8efaf639..1d3d258a 100644 --- a/src/client/clfunc_xgemm.hpp +++ b/src/client/clfunc_xgemm.hpp @@ -21,6 +21,7 @@ #define CLBLAS_BENCHMARK_XGEMM_HXX__ #include "clfunc_common.hpp" +#include "cblas.h" template struct xGemmBuffer @@ -43,14 +44,16 @@ struct xGemmBuffer T* a_; T* b_; T* c_; + T* c_copy; cl_mem buf_a_; cl_mem buf_b_; cl_mem buf_c_; T alpha_; T beta_; - cl_uint apiCallCount; + cl_uint apiCallCount; }; // struct buffer + template class xGemm : public clblasFunc { @@ -68,20 +71,34 @@ class xGemm : public clblasFunc void call_func() { - timer.Start(timer_id); - xGemm_Function(true, buffer_.apiCallCount); - timer.Stop(timer_id); + timer.Start(timer_id); + xGemm_Function(true, buffer_.apiCallCount); + timer.Stop(timer_id); + } + + + void validate_with_cblas(int validate) + { + if(validate) + { + initialize_cpu_buffer(); + initialize_gpu_buffer(); + xGemm_Function(true, 1); + read_gpu_buffer(); + validation(); + } } + double gflops() { - return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount); + return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount); } - void setup_apiCallCount(cl_uint apiCallCount) - { - buffer_.apiCallCount = apiCallCount; - } + void setup_apiCallCount(cl_uint apiCallCount) + { + buffer_.apiCallCount = apiCallCount; + } std::string gflops_formula() { return "2.0*M*N*K/time"; @@ -322,6 +339,7 @@ class xGemm : public clblasFunc buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; + buffer_.c_copy = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, @@ -366,7 +384,7 @@ class xGemm : public clblasFunc { for (size_t j = 0; j < buffer_.ldc_; ++j) { - buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / + buffer_.c_copy[i*buffer_.ldc_+j] = buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } @@ -375,7 +393,7 @@ class xGemm : public clblasFunc void initialize_gpu_buffer() { - cl_int err; + cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), @@ -408,19 +426,19 @@ class xGemm : public clblasFunc buffer_.c_, 0, NULL, NULL); } - void read_gpu_buffer() - { - cl_int err; - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + void read_gpu_buffer() + { + cl_int err; + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, NULL); - } + buffer_.c_, 0, NULL, NULL); + } - void roundtrip_func() - { - timer.Start(timer_id); - cl_int err; + void roundtrip_func() + { + timer.Start(timer_id); + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -452,42 +470,42 @@ class xGemm : public clblasFunc buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); - xGemm_Function(false); - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + xGemm_Function(false); + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void roundtrip_func_rect() - { - timer.Start(timer_id); - cl_int err; - //rect - size_t a_buffer_origin[3] = {0,0,0}; - size_t a_host_origin[3] = {0,0,0}; - size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; - size_t a_buffer_row_pitch=0*sizeof(T);//lda - size_t a_buffer_slice_pitch=0; - size_t a_host_row_pitch=buffer_.lda_*sizeof(T); - size_t a_host_slice_pitch=0; - - size_t b_buffer_origin[3] = {0,0,0}; - size_t b_host_origin[3] = {0,0,0}; - size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; - size_t b_buffer_row_pitch=0*sizeof(T);//ldb - size_t b_buffer_slice_pitch=0; - size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); - size_t b_host_slice_pitch=0; - - size_t c_buffer_origin[3] = {0,0,0}; - size_t c_host_origin[3] = {0,0,0}; - size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; - size_t c_buffer_row_pitch=0*sizeof(T);//ldc - size_t c_buffer_slice_pitch=0; - size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); - size_t c_host_slice_pitch=0; + buffer_.c_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); + } + void roundtrip_func_rect() + { + timer.Start(timer_id); + cl_int err; + //rect + size_t a_buffer_origin[3] = {0,0,0}; + size_t a_host_origin[3] = {0,0,0}; + size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; + size_t a_buffer_row_pitch=0*sizeof(T);//lda + size_t a_buffer_slice_pitch=0; + size_t a_host_row_pitch=buffer_.lda_*sizeof(T); + size_t a_host_slice_pitch=0; + + size_t b_buffer_origin[3] = {0,0,0}; + size_t b_host_origin[3] = {0,0,0}; + size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; + size_t b_buffer_row_pitch=0*sizeof(T);//ldb + size_t b_buffer_slice_pitch=0; + size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); + size_t b_host_slice_pitch=0; + + size_t c_buffer_origin[3] = {0,0,0}; + size_t c_host_origin[3] = {0,0,0}; + size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; + size_t c_buffer_row_pitch=0*sizeof(T);//ldc + size_t c_buffer_slice_pitch=0; + size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); + size_t c_host_slice_pitch=0; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.k_*buffer_.m_ + @@ -504,12 +522,12 @@ class xGemm : public clblasFunc buffer_.offC_) * sizeof(T), NULL, &err); /* - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); - + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * @@ -522,47 +540,47 @@ class xGemm : public clblasFunc sizeof(T), buffer_.c_, 0, NULL, NULL);*/ err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch, - a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); + a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch, - b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); + b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, - c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); - - if(buffer_.trans_a_==clblasNoTrans) - { - buffer_.lda_=buffer_.m_; - } - else - { - buffer_.lda_=buffer_.k_; - } - if(buffer_.trans_b_==clblasNoTrans) - { - buffer_.ldb_=buffer_.k_; - } - else - { - buffer_.ldb_=buffer_.n_; - } - buffer_.ldc_=buffer_.m_; - xGemm_Function(false); - /* - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); + + if(buffer_.trans_a_==clblasNoTrans) + { + buffer_.lda_=buffer_.m_; + } + else + { + buffer_.lda_=buffer_.k_; + } + if(buffer_.trans_b_==clblasNoTrans) + { + buffer_.ldb_=buffer_.k_; + } + else + { + buffer_.ldb_=buffer_.n_; + } + buffer_.ldc_=buffer_.m_; + xGemm_Function(false); + /* + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, &event_); - */ - err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, - c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void allochostptr_roundtrip_func() - { - timer.Start(timer_id); - - cl_int err; - // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy + buffer_.c_, 0, NULL, &event_); + */ + err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, + c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); + } + void allochostptr_roundtrip_func() + { + timer.Start(timer_id); + + cl_int err; + // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -578,45 +596,45 @@ class xGemm : public clblasFunc buffer_.offC_) * sizeof(T), NULL, &err); - // map the buffers to pointers at host device - T *map_a,*map_b,*map_c; - map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.a_num_vectors_ + + // map the buffers to pointers at host device + T *map_a,*map_b,*map_c; + map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), - 0, NULL, NULL, &err); - map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.ldb_*buffer_.b_num_vectors_ + + 0, NULL, NULL, &err); + map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), - 0, NULL, NULL, &err); - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - // memcpy the input A, B, C to the host pointers - memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); - memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - // unmap the buffers - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); - // calling clBLAS - xGemm_Function(false); - // map the C buffer again to read output - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + // memcpy the input A, B, C to the host pointers + memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); + memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + // unmap the buffers + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); + // calling clBLAS + xGemm_Function(false); + // map the C buffer again to read output + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); - clWaitForEvents(1, &event_); - - timer.Stop(timer_id); - } - void usehostptr_roundtrip_func() - { - timer.Start(timer_id); - cl_int err; + 0, NULL, NULL, &err); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); + clWaitForEvents(1, &event_); + + timer.Stop(timer_id); + } + void usehostptr_roundtrip_func() + { + timer.Start(timer_id); + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -631,13 +649,13 @@ class xGemm : public clblasFunc (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), buffer_.c_, &err); - xGemm_Function(true); - timer.Stop(timer_id); - } - void copyhostptr_roundtrip_func() - { - timer.Start(timer_id); - cl_int err; + xGemm_Function(true); + timer.Stop(timer_id); + } + void copyhostptr_roundtrip_func() + { + timer.Start(timer_id); + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), @@ -652,20 +670,20 @@ class xGemm : public clblasFunc (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), buffer_.c_, &err); - xGemm_Function(false); - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, - buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * + xGemm_Function(false); + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, + buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), - buffer_.c_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); - } - void usepersismem_roundtrip_func() - { + buffer_.c_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); + } + void usepersismem_roundtrip_func() + { #if defined(CL_MEM_USE_PERSISTENT_MEM_AMD) - timer.Start(timer_id); + timer.Start(timer_id); - cl_int err; + cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.lda_*buffer_.a_num_vectors_ + @@ -682,46 +700,46 @@ class xGemm : public clblasFunc buffer_.offC_) * sizeof(T), NULL, &err); - // map the buffers to pointers at host devices - T *map_a,*map_b,*map_c; - map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.a_num_vectors_ + + // map the buffers to pointers at host devices + T *map_a,*map_b,*map_c; + map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), - 0, NULL, NULL, &err); - map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.ldb_*buffer_.b_num_vectors_ + + 0, NULL, NULL, &err); + map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), - 0, NULL, NULL, &err); - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - // memcpy the input A, B, C to the host pointers - memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); - memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - // unmap the buffers - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); - // calling clBLAS - xGemm_Function(false); - // map the C buffer again to read output - map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, - (buffer_.lda_*buffer_.c_num_vectors_ + + 0, NULL, NULL, &err); + // memcpy the input A, B, C to the host pointers + memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); + memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + // unmap the buffers + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); + // calling clBLAS + xGemm_Function(false); + // map the C buffer again to read output + map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, + (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), - 0, NULL, NULL, &err); - memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); - clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); - clWaitForEvents(1, &event_); + 0, NULL, NULL, &err); + memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); + clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); + clWaitForEvents(1, &event_); - timer.Stop(timer_id); + timer.Stop(timer_id); #else - std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"< buffer_; - void xGemm_Function(bool flush, cl_uint apiCallCount = 1); - unsigned int numQueuesToUse; - cl_event events_[numQueues]; - + void xGemm_Function(bool flush, cl_uint apiCallCount = 1); + unsigned int numQueuesToUse; + cl_event events_[numQueues]; + void validation(); }; // class xgemm template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { for (unsigned int i = 0; i < numQueues; i++) { events_[i] = NULL; } - for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_, - buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, - buffer_.buf_a_, buffer_.offA_, buffer_.lda_, - buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, - buffer_.beta_, buffer_.buf_c_, buffer_.offC_, - buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); - } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + for (unsigned int i = 0; i < apiCallCount; i++) + { + clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, + buffer_.buf_a_, buffer_.offA_, buffer_.lda_, + buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, + buffer_.beta_, buffer_.buf_c_, buffer_.offC_, + buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); + } + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1025,16 +1044,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } - + for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { @@ -1042,18 +1061,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + { + clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1070,16 +1089,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } - + for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { @@ -1087,18 +1106,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + { + clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1115,16 +1134,16 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } - + for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> -void +void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { @@ -1132,18 +1151,18 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) - { - clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_, + { + clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } - //flush==true if only the kernel time (library call) is timed - //flush==false if memory time is also timed - if (flush==true) - { + //flush==true if only the kernel time (library call) is timed + //flush==false if memory time is also timed + if (flush==true) + { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { @@ -1164,8 +1183,8 @@ xGemm_Function(bool flush, cl_uint apiCallCount ) clFlush(queues_[i]); } - clWaitForEvents(numValidEvents, events_); - } + clWaitForEvents(numValidEvents, events_); + } } template<> @@ -1200,4 +1219,78 @@ gflops_formula() return "8.0*M*N*K/time"; } +template<> +void +xGemm:: +validation() +{ + cblas_sgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + buffer_.beta_, + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cblas_saxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.c_, 1, buffer_.c_copy, 1); + float norm_error = cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_sgemm against cblas_sgemm = %f \n", norm_error); +} + +template<> +void +xGemm:: +validation() +{ + cblas_dgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + buffer_.beta_, + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cblas_daxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.c_, 1, buffer_.c_copy, 1); + double norm_error = cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_dgemm against cblas_dgemm = %f \n", norm_error); +} + +template<> +void +xGemm:: +validation() +{ + cblas_cgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + &(buffer_.beta_), + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cl_float2 neg_one = makeScalar(-1.0); + cblas_caxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1); + float norm_error = cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_cgemm against cblas_cgemm = %f \n", norm_error); +} + +template<> +void +xGemm:: +validation() +{ + cblas_zgemm(clblasToCblas_order(order_), clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_operation(buffer_.trans_b_), + buffer_.m_, buffer_.n_, buffer_.k_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_ + buffer_.offB_, buffer_.ldb_, + &(buffer_.beta_), + buffer_.c_copy + buffer_.offC_, buffer_.ldc_); + + cl_double2 neg_one = makeScalar(-1.0); + cblas_zaxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.c_, 1, buffer_.c_copy, 1); + double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.c_copy, 1)/ + cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.c_, 1); + printf("Error of clblas_zgemm against cblas_zgemm = %f \n", norm_error); +} + #endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp index 92d883cf..8125845a 100644 --- a/src/client/clfunc_xtrmm.hpp +++ b/src/client/clfunc_xtrmm.hpp @@ -40,6 +40,7 @@ struct xTrmmBuffer clblasDiag diag_; T* a_; T* b_; + T* b_copy; cl_mem buf_a_; cl_mem buf_b_; T alpha_; @@ -64,6 +65,20 @@ class xTrmm : public clblasFunc std::cout << "xtrmm::call_func\n"; } + + void validate_with_cblas(int validate) + { + if(validate) + { + initialize_cpu_buffer(); + initialize_gpu_buffer(); + call_func(); + read_gpu_buffer(); + validation(); + } + } + + double gflops() { if (buffer_.side_ == clblasLeft) @@ -225,6 +240,7 @@ class xTrmm : public clblasFunc buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; + buffer_.b_copy = new T[buffer_.ldb_*buffer_.b_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, @@ -246,7 +262,7 @@ class xTrmm : public clblasFunc { for (size_t j = 0; j < buffer_.ldb_; ++j) { - buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / + buffer_.b_copy[i*buffer_.ldb_+j] = buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / randomScale(); } } @@ -294,29 +310,29 @@ class xTrmm : public clblasFunc sizeof(T), buffer_.b_, 0, NULL, NULL); } - void read_gpu_buffer() - { - cl_int err; - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * - sizeof(T), - buffer_.b_, 0, NULL, NULL); - } - void roundtrip_func() - { - std::cout << "xTrmm::roundtrip_func\n"; - } - void zerocopy_roundtrip_func() - { - std::cout << "xTrmm::zerocopy_roundtrip_func\n"; - } - void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, + void read_gpu_buffer() + { + cl_int err; + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * + sizeof(T), + buffer_.b_, 0, NULL, NULL); + } + void roundtrip_func() + { + std::cout << "xTrmm::roundtrip_func\n"; + } + void zerocopy_roundtrip_func() + { + std::cout << "xTrmm::zerocopy_roundtrip_func\n"; + } + void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) - { - DUMMY_ARGS_USAGE_3(transB_option, K, beta); + { + DUMMY_ARGS_USAGE_3(transB_option, K, beta); DUMMY_ARGS_USAGE_2(ldc, offCY); initialize_scalars(alpha, beta); @@ -447,18 +463,20 @@ class xTrmm : public clblasFunc buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; - } - void releaseGPUBuffer_deleteCPUBuffer() - { - //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) - //need to do this before we eventually hit the destructor + } + + void releaseGPUBuffer_deleteCPUBuffer() + { + //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) + //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.b_; + delete buffer_.b_copy; OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_), "releasing buffer B"); - } + } protected: void initialize_scalars(double alpha, double beta) { @@ -468,7 +486,7 @@ class xTrmm : public clblasFunc private: xTrmmBuffer buffer_; - + void validation(); }; // class xTrmm template<> @@ -494,9 +512,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float), @@ -506,8 +524,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float), @@ -518,20 +536,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_float), buffer_.b_, 0, NULL, NULL); - //call_func - clblasStrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasStrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -558,9 +576,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double), @@ -570,8 +588,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double), @@ -582,20 +600,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_double), buffer_.b_, 0, NULL, NULL); - //call_func - clblasDtrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasDtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -622,9 +640,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float2), @@ -634,8 +652,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float2), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float2), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float2), @@ -646,20 +664,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_float2), buffer_.b_, 0, NULL, NULL); - //call_func - clblasCtrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasCtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float2), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -686,9 +704,9 @@ void xTrmm:: roundtrip_func() { - timer.Start(timer_id); - cl_int err; - //set up buffer + timer.Start(timer_id); + cl_int err; + //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double2), @@ -698,8 +716,8 @@ roundtrip_func() (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double2), NULL, &err); - //initialize gpu buffer - err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, + //initialize gpu buffer + err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double2), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double2), @@ -710,20 +728,20 @@ roundtrip_func() buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_double2), buffer_.b_, 0, NULL, NULL); - //call_func - clblasZtrmm(order_, buffer_.side_, buffer_.uplo_, + //call_func + clblasZtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); - //read gpu buffer - err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, - buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ * + //read gpu buffer + err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, + buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double2), - buffer_.b_, 0, NULL, &event_); - clWaitForEvents(1, &event_); - timer.Stop(timer_id); + buffer_.b_, 0, NULL, &event_); + clWaitForEvents(1, &event_); + timer.Stop(timer_id); } @@ -790,5 +808,87 @@ gflops_formula() } } +template<> +void +xTrmm:: +validation() +{ + cblas_strmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cblas_saxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.b_, 1, buffer_.b_copy, 1); + float norm_error = cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_snrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_strmm against cblas_strmm = %f \n", norm_error); +} + + +template<> +void +xTrmm:: +validation() +{ + cblas_dtrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, buffer_.alpha_, + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cblas_daxpy(buffer_.lda_ * buffer_.n_, -1.0, buffer_.b_, 1, buffer_.b_copy, 1); + double norm_error = cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_dnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_dtrmm against cblas_dtrmm = %f \n", norm_error); +} + +template<> +void +xTrmm:: +validation() +{ + cblas_ctrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cl_float2 neg_one = makeScalar(-1.0); + cblas_caxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1); + float norm_error = cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_scnrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_ctrmm against cblas_ctrmm = %f \n", norm_error); +} + + +template<> +void +xTrmm:: +validation() +{ + cblas_ztrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_diag(buffer_.diag_), + buffer_.m_, buffer_.n_, &(buffer_.alpha_), + buffer_.a_ + buffer_.offA_, buffer_.lda_, + buffer_.b_copy + buffer_.offB_, buffer_.ldb_); + + cl_double2 neg_one = makeScalar(-1.0); + cblas_zaxpy(buffer_.lda_ * buffer_.n_, &neg_one, buffer_.b_, 1, buffer_.b_copy, 1); + double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ + cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); + printf("Error of clblas_ztrmm against cblas_ztrmm = %f \n", norm_error); + + +} + #endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__ diff --git a/src/client/client.cpp b/src/client/client.cpp index d067c3db..ba9c5fc4 100644 --- a/src/client/client.cpp +++ b/src/client/client.cpp @@ -46,547 +46,552 @@ namespace po = boost::program_options; int main(int argc, char *argv[]) { - size_t M; - size_t N; - size_t K; - cl_double alpha; - cl_double beta; - cl_uint profileCount; - cl_uint apiCallCount; - cl_uint commandQueueFlags = 0; - cl_device_type deviceType = CL_DEVICE_TYPE_GPU; - int order_option; - //clblasOrder order; - //clblasTranspose transA; - //clblasTranspose transB; - int transA_option; - int transB_option; - size_t lda; - size_t ldb; - size_t ldc; - size_t offA; - size_t offBX; - size_t offCY; - std::string function; - std::string precision; - std::string roundtrip; - std::string memalloc; - int side_option; - int uplo_option; - int diag_option; - unsigned int numQueuesToUse; + size_t M; + size_t N; + size_t K; + cl_double alpha; + cl_double beta; + cl_uint profileCount; + cl_uint apiCallCount; + cl_uint commandQueueFlags = 0; + cl_device_type deviceType = CL_DEVICE_TYPE_GPU; + int order_option; + //clblasOrder order; + //clblasTranspose transA; + //clblasTranspose transB; + int transA_option; + int transB_option; + size_t lda; + size_t ldb; + size_t ldc; + size_t offA; + size_t offBX; + size_t offCY; + std::string function; + std::string precision; + std::string roundtrip; + std::string memalloc; + int side_option; + int uplo_option; + int diag_option; + unsigned int numQueuesToUse; + int validate; - po::options_description desc( "clBLAS client command line options" ); - desc.add_options() - ( "help,h", "produces this help message" ) - ( "gpu,g", "Force instantiation of an OpenCL GPU device" ) - ( "cpu,c", "Force instantiation of an OpenCL CPU device" ) - ( "all,a", "Force instantiation of all OpenCL devices" ) - ( "useimages", "Use an image-based kernel" ) - ( "sizem,m", po::value( &M )->default_value(128), "number of rows in A and C" ) - ( "sizen,n", po::value( &N )->default_value(128), "number of columns in B and C" ) - ( "sizek,k", po::value( &K )->default_value(128), "number of columns in A and rows in B" ) - ( "lda", po::value( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" ) - ( "ldb", po::value( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" ) - ( "ldc", po::value( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" ) - ( "offA", po::value( &offA )->default_value(0), "offset of the matrix A in memory object" ) - ( "offBX", po::value( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" ) - ( "offCY", po::value( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" ) - ( "alpha", po::value( &alpha )->default_value(1.0f), "specifies the scalar alpha" ) - ( "beta", po::value( &beta )->default_value(1.0f), "specifies the scalar beta" ) - ( "order,o", po::value( &order_option )->default_value(0), "0 = row major, 1 = column major" ) - ( "transposeA", po::value( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) - ( "transposeB", po::value( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) - ( "function,f", po::value( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" ) - ( "precision,r", po::value( &precision )->default_value("s"), "Options: s,d,c,z" ) - ( "side", po::value( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm - ( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm - ( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm - ( "profile,p", po::value( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" ) - ( "apiCallCount", po::value(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)") - ( "numQueues", po::value(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)") - ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)") - ( "memalloc", po::value( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem") - ; + po::options_description desc( "clBLAS client command line options" ); + desc.add_options() + ( "help,h", "produces this help message" ) + ( "gpu,g", "Force instantiation of an OpenCL GPU device" ) + ( "cpu,c", "Force instantiation of an OpenCL CPU device" ) + ( "all,a", "Force instantiation of all OpenCL devices" ) + ( "useimages", "Use an image-based kernel" ) + ( "sizem,m", po::value( &M )->default_value(128), "number of rows in A and C" ) + ( "sizen,n", po::value( &N )->default_value(128), "number of columns in B and C" ) + ( "sizek,k", po::value( &K )->default_value(128), "number of columns in A and rows in B" ) + ( "lda", po::value( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" ) + ( "ldb", po::value( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" ) + ( "ldc", po::value( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" ) + ( "offA", po::value( &offA )->default_value(0), "offset of the matrix A in memory object" ) + ( "offBX", po::value( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" ) + ( "offCY", po::value( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" ) + ( "alpha", po::value( &alpha )->default_value(1.0f), "specifies the scalar alpha" ) + ( "beta", po::value( &beta )->default_value(1.0f), "specifies the scalar beta" ) + ( "order,o", po::value( &order_option )->default_value(1), "0 = row major, 1 = column major" ) + ( "transposeA", po::value( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) + ( "transposeB", po::value( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) + ( "function,f", po::value( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" ) + ( "precision,r", po::value( &precision )->default_value("s"), "Options: s,d,c,z" ) + ( "side", po::value( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm + ( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm + ( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm + ( "profile,p", po::value( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" ) + ( "apiCallCount", po::value(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)") + ( "numQueues", po::value(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)") + ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)") + ( "memalloc", po::value( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem") + ( "validate,v", po::value(&validate)->default_value(0), "Validate GPU results with CPU BLAS? 0 = No, 1 = Yes (default: No): currently only available for gemm and trmm") + ; - po::variables_map vm; - po::store( po::parse_command_line( argc, argv, desc ), vm ); - po::notify( vm ); + po::variables_map vm; + po::store( po::parse_command_line( argc, argv, desc ), vm ); + po::notify( vm ); - if( vm.count( "help" ) ) - { - std::cout << desc << std::endl; - return 0; - } + if( vm.count( "help" ) ) + { + std::cout << desc << std::endl; + return 0; + } - if( function != "gemm" - && function != "trsm" - && function != "trmm" - && function != "gemv" - && function != "symv" - && function != "syrk" - && function != "syr2k" - && function != "trsv" - && function != "trmv" - && function != "ger" - && function != "syr" - && function != "syr2" - && function != "geru" - && function != "gerc" - && function != "her" - && function != "her2" - && function != "hemv" - && function != "hemm" - && function != "symm" - && function != "herk" - && function != "her2k" - ) - { - std::cerr << "Invalid value for --function" << std::endl; - return -1; - } + if( function != "gemm" + && function != "trsm" + && function != "trmm" + && function != "gemv" + && function != "symv" + && function != "syrk" + && function != "syr2k" + && function != "trsv" + && function != "trmv" + && function != "ger" + && function != "syr" + && function != "syr2" + && function != "geru" + && function != "gerc" + && function != "her" + && function != "her2" + && function != "hemv" + && function != "hemm" + && function != "symm" + && function != "herk" + && function != "her2k" + ) + { + std::cerr << "Invalid value for --function" << std::endl; + return -1; + } - if( precision != "s" && precision != "d" && precision != "c" && precision != "z" ) - { - std::cerr << "Invalid value for --precision" << std::endl; - return -1; - } + if( precision != "s" && precision != "d" && precision != "c" && precision != "z" ) + { + std::cerr << "Invalid value for --precision" << std::endl; + return -1; + } - size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) - | ((vm.count( "cpu" ) > 0) ? 2 : 0) - | ((vm.count( "all" ) > 0) ? 4 : 0); - if((mutex & (mutex-1)) != 0) { - std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl; - if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl; - if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl; - if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl; - return 1; - } + size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) + | ((vm.count( "cpu" ) > 0) ? 2 : 0) + | ((vm.count( "all" ) > 0) ? 4 : 0); + if((mutex & (mutex-1)) != 0) { + std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl; + if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl; + if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl; + if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl; + return 1; + } - if( vm.count( "gpu" ) ) - { - deviceType = CL_DEVICE_TYPE_GPU; - } + if( vm.count( "gpu" ) ) + { + deviceType = CL_DEVICE_TYPE_GPU; + } - if( vm.count( "cpu" ) ) - { - deviceType = CL_DEVICE_TYPE_CPU; - } + if( vm.count( "cpu" ) ) + { + deviceType = CL_DEVICE_TYPE_CPU; + } - if( vm.count( "all" ) ) - { - deviceType = CL_DEVICE_TYPE_ALL; - } + if( vm.count( "all" ) ) + { + deviceType = CL_DEVICE_TYPE_ALL; + } - if( profileCount >= 1 ) - { - commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE; - } + if( profileCount >= 1 ) + { + commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE; + } - bool useimages; - if( vm.count("useimages") ) - useimages = true; - else - useimages = false; + bool useimages; + if( vm.count("useimages") ) + useimages = true; + else + useimages = false; - StatisticalTimer& timer = StatisticalTimer::getInstance( ); - timer.Reserve( 3, profileCount ); - timer.setNormalize( true ); + StatisticalTimer& timer = StatisticalTimer::getInstance( ); + timer.Reserve( 3, profileCount ); + timer.setNormalize( true ); - clblasFunc *my_function = NULL; - if (function == "gemm") - { - if (precision == "s") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else if (precision == "d") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else if (precision == "c") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else if (precision == "z") - my_function = new xGemm(timer, deviceType, numQueuesToUse); - else - { - std::cerr << "Unknown gemm function" << std::endl; - return -1; - } - } - else if (function == "trsm") - { - if (precision == "s") - my_function = new xTrsm(timer, deviceType); - else if (precision == "d") - my_function = new xTrsm(timer, deviceType); - else if (precision == "c") - my_function = new xTrsm(timer, deviceType); - else if (precision == "z") - my_function = new xTrsm(timer, deviceType); - else - { - std::cerr << "Unknown trsm function" << std::endl; - return -1; - } - } - else if (function == "trmm") - { - if (precision == "s") - my_function = new xTrmm(timer, deviceType); - else if (precision == "d") - my_function = new xTrmm(timer, deviceType); - else if (precision == "c") - my_function = new xTrmm(timer, deviceType); - else if (precision == "z") - my_function = new xTrmm(timer, deviceType); - else + clblasFunc *my_function = NULL; + if (function == "gemm") { - std::cerr << "Unknown trmm function" << std::endl; - return -1; - } - } - else if (function == "gemv") - { - if (precision == "s") - my_function = new xGemv(timer, deviceType); - else if (precision == "d") - my_function = new xGemv(timer, deviceType); - else if (precision == "c") - my_function = new xGemv(timer, deviceType); - else if (precision == "z") - my_function = new xGemv(timer, deviceType); - else + if (precision == "s") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else if (precision == "d") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else if (precision == "c") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else if (precision == "z") + my_function = new xGemm(timer, deviceType, numQueuesToUse); + else + { + std::cerr << "Unknown gemm function" << std::endl; + return -1; + } + } + else if (function == "trsm") { - std::cerr << "Unknown gemv function" << std::endl; - return -1; + if (precision == "s") + my_function = new xTrsm(timer, deviceType); + else if (precision == "d") + my_function = new xTrsm(timer, deviceType); + else if (precision == "c") + my_function = new xTrsm(timer, deviceType); + else if (precision == "z") + my_function = new xTrsm(timer, deviceType); + else + { + std::cerr << "Unknown trsm function" << std::endl; + return -1; + } } - } - else if (function == "symv") - { - if (precision == "s") - my_function = new xSymv(timer, deviceType); - else if (precision == "d") - my_function = new xSymv(timer, deviceType); - else + else if (function == "trmm") { - std::cerr << "Unknown symv function" << std::endl; - return -1; + if (precision == "s") + my_function = new xTrmm(timer, deviceType); + else if (precision == "d") + my_function = new xTrmm(timer, deviceType); + else if (precision == "c") + my_function = new xTrmm(timer, deviceType); + else if (precision == "z") + my_function = new xTrmm(timer, deviceType); + else + { + std::cerr << "Unknown trmm function" << std::endl; + return -1; + } } - } - else if (function == "syrk") - { - if (precision == "s") - my_function = new xSyrk(timer, deviceType); - else if (precision == "d") - my_function = new xSyrk(timer, deviceType); + else if (function == "gemv") + { + if (precision == "s") + my_function = new xGemv(timer, deviceType); + else if (precision == "d") + my_function = new xGemv(timer, deviceType); else if (precision == "c") - my_function = new xSyrk(timer, deviceType); + my_function = new xGemv(timer, deviceType); else if (precision == "z") - my_function = new xSyrk(timer, deviceType); - else + my_function = new xGemv(timer, deviceType); + else + { + std::cerr << "Unknown gemv function" << std::endl; + return -1; + } + } + else if (function == "symv") { - std::cerr << "Unknown syrk function" << std::endl; - return -1; - } - } - else if (function == "syr2k") - { - if (precision == "s") - my_function = new xSyr2k(timer, deviceType); - else if (precision == "d") - my_function = new xSyr2k(timer, deviceType); - else if (precision == "c") - my_function = new xSyr2k(timer, deviceType); - else if (precision == "z") - my_function = new xSyr2k(timer, deviceType); - else + if (precision == "s") + my_function = new xSymv(timer, deviceType); + else if (precision == "d") + my_function = new xSymv(timer, deviceType); + else + { + std::cerr << "Unknown symv function" << std::endl; + return -1; + } + } + else if (function == "syrk") { - std::cerr << "Unknown syr2k function" << std::endl; - return -1; - } - } - else if (function == "trsv") - { - if (precision == "s") - my_function = new xTrsv(timer, deviceType); - else if (precision == "d") - my_function = new xTrsv(timer, deviceType); - else if (precision == "c") - my_function = new xTrsv(timer, deviceType); - else if (precision == "z") - my_function = new xTrsv(timer, deviceType); - else + if (precision == "s") + my_function = new xSyrk(timer, deviceType); + else if (precision == "d") + my_function = new xSyrk(timer, deviceType); + else if (precision == "c") + my_function = new xSyrk(timer, deviceType); + else if (precision == "z") + my_function = new xSyrk(timer, deviceType); + else + { + std::cerr << "Unknown syrk function" << std::endl; + return -1; + } + } + else if (function == "syr2k") { - std::cerr << "Unknown trsv function" << std::endl; - return -1; - } - } - else if (function == "trmv") - { - if (precision == "s") - my_function = new xTrmv(timer, deviceType); - else if (precision == "d") - my_function = new xTrmv(timer, deviceType); - else if (precision == "c") - my_function = new xTrmv(timer, deviceType); - else if (precision == "z") - my_function = new xTrmv(timer, deviceType); - else + if (precision == "s") + my_function = new xSyr2k(timer, deviceType); + else if (precision == "d") + my_function = new xSyr2k(timer, deviceType); + else if (precision == "c") + my_function = new xSyr2k(timer, deviceType); + else if (precision == "z") + my_function = new xSyr2k(timer, deviceType); + else + { + std::cerr << "Unknown syr2k function" << std::endl; + return -1; + } + } + else if (function == "trsv") + { + if (precision == "s") + my_function = new xTrsv(timer, deviceType); + else if (precision == "d") + my_function = new xTrsv(timer, deviceType); + else if (precision == "c") + my_function = new xTrsv(timer, deviceType); + else if (precision == "z") + my_function = new xTrsv(timer, deviceType); + else + { + std::cerr << "Unknown trsv function" << std::endl; + return -1; + } + } + else if (function == "trmv") { - std::cerr << "Unknown trmv function" << std::endl; - return -1; + if (precision == "s") + my_function = new xTrmv(timer, deviceType); + else if (precision == "d") + my_function = new xTrmv(timer, deviceType); + else if (precision == "c") + my_function = new xTrmv(timer, deviceType); + else if (precision == "z") + my_function = new xTrmv(timer, deviceType); + else + { + std::cerr << "Unknown trmv function" << std::endl; + return -1; + } } - } - else if (function == "ger") - { - if (precision == "s") - my_function = new xGer(timer, deviceType); - else if (precision == "d") - my_function = new xGer(timer, deviceType); - else + else if (function == "ger") { - std::cerr << "Unknown ger function" << std::endl; - return -1; + if (precision == "s") + my_function = new xGer(timer, deviceType); + else if (precision == "d") + my_function = new xGer(timer, deviceType); + else + { + std::cerr << "Unknown ger function" << std::endl; + return -1; + } } - } - else if (function == "syr") - { - if (precision == "s") - my_function = new xSyr(timer, deviceType); - else if (precision == "d") - my_function = new xSyr(timer, deviceType); - else + else if (function == "syr") { - std::cerr << "Unknown syr function" << std::endl; - return -1; + if (precision == "s") + my_function = new xSyr(timer, deviceType); + else if (precision == "d") + my_function = new xSyr(timer, deviceType); + else + { + std::cerr << "Unknown syr function" << std::endl; + return -1; + } } - } - else if (function == "syr2") - { - if (precision == "s") - my_function = new xSyr2(timer, deviceType); - else if (precision == "d") - my_function = new xSyr2(timer, deviceType); - else + else if (function == "syr2") { - std::cerr << "Unknown syr2 function" << std::endl; - return -1; + if (precision == "s") + my_function = new xSyr2(timer, deviceType); + else if (precision == "d") + my_function = new xSyr2(timer, deviceType); + else + { + std::cerr << "Unknown syr2 function" << std::endl; + return -1; + } } - } - else if (function == "geru") - { - if (precision == "c") - my_function = new xGeru(timer, deviceType); - else if (precision == "z") - my_function = new xGeru(timer, deviceType); - else + else if (function == "geru") { - std::cerr << "Unknown geru function" << std::endl; - return -1; + if (precision == "c") + my_function = new xGeru(timer, deviceType); + else if (precision == "z") + my_function = new xGeru(timer, deviceType); + else + { + std::cerr << "Unknown geru function" << std::endl; + return -1; + } } - } - else if (function == "gerc") - { - if (precision == "c") - my_function = new xGerc(timer, deviceType); - else if (precision == "z") - my_function = new xGerc(timer, deviceType); - else + else if (function == "gerc") { - std::cerr << "Unknown gerc function" << std::endl; - return -1; + if (precision == "c") + my_function = new xGerc(timer, deviceType); + else if (precision == "z") + my_function = new xGerc(timer, deviceType); + else + { + std::cerr << "Unknown gerc function" << std::endl; + return -1; + } } - } - else if (function == "her") - { - if (precision == "c") - my_function = new xHer(timer, deviceType); - else if (precision == "z") - my_function = new xHer(timer, deviceType); - else + else if (function == "her") { - std::cerr << "Unknown her function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHer(timer, deviceType); + else if (precision == "z") + my_function = new xHer(timer, deviceType); + else + { + std::cerr << "Unknown her function" << std::endl; + return -1; + } } - } - else if (function == "her2") - { - if (precision == "c") - my_function = new xHer2(timer, deviceType); - else if (precision == "z") - my_function = new xHer2(timer, deviceType); - else + else if (function == "her2") { - std::cerr << "Unknown her2 function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHer2(timer, deviceType); + else if (precision == "z") + my_function = new xHer2(timer, deviceType); + else + { + std::cerr << "Unknown her2 function" << std::endl; + return -1; + } } - } - else if (function == "hemv") - { - if (precision == "c") - my_function = new xHemv(timer, deviceType); - else if (precision == "z") - my_function = new xHemv(timer, deviceType); - else + else if (function == "hemv") { - std::cerr << "Unknown hemv function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHemv(timer, deviceType); + else if (precision == "z") + my_function = new xHemv(timer, deviceType); + else + { + std::cerr << "Unknown hemv function" << std::endl; + return -1; + } } - } - else if (function == "hemm") - { - if (precision == "c") - my_function = new xHemm(timer, deviceType); - else if (precision == "z") - my_function = new xHemm(timer, deviceType); - else + else if (function == "hemm") { - std::cerr << "Unknown hemm function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHemm(timer, deviceType); + else if (precision == "z") + my_function = new xHemm(timer, deviceType); + else + { + std::cerr << "Unknown hemm function" << std::endl; + return -1; + } } - } - else if (function == "herk") - { - if (precision == "c") - my_function = new xHerk(timer, deviceType); - else if (precision == "z") - my_function = new xHerk(timer, deviceType); - else + else if (function == "herk") { - std::cerr << "Unknown her function" << std::endl; - return -1; + if (precision == "c") + my_function = new xHerk(timer, deviceType); + else if (precision == "z") + my_function = new xHerk(timer, deviceType); + else + { + std::cerr << "Unknown her function" << std::endl; + return -1; + } } - } - else if (function == "her2k") - { - if (precision == "c") - my_function = new xHer2k(timer, deviceType); - else if (precision == "z") - my_function = new xHer2k(timer, deviceType); - else + else if (function == "her2k") { - std::cerr << "Unknown her2 function" << std::endl; - return -1; - } - } - else if (function == "symm") - { - if (precision == "s") - my_function = new xSymm(timer, deviceType); - else if (precision == "d") - my_function = new xSymm(timer, deviceType); - else if (precision == "c") - my_function = new xSymm(timer, deviceType); - else if (precision == "z") - my_function = new xSymm(timer, deviceType); - else + if (precision == "c") + my_function = new xHer2k(timer, deviceType); + else if (precision == "z") + my_function = new xHer2k(timer, deviceType); + else + { + std::cerr << "Unknown her2 function" << std::endl; + return -1; + } + } + else if (function == "symm") { - std::cerr << "Unknown symm function" << std::endl; - return -1; + if (precision == "s") + my_function = new xSymm(timer, deviceType); + else if (precision == "d") + my_function = new xSymm(timer, deviceType); + else if (precision == "c") + my_function = new xSymm(timer, deviceType); + else if (precision == "z") + my_function = new xSymm(timer, deviceType); + else + { + std::cerr << "Unknown symm function" << std::endl; + return -1; + } } - } - try - { - my_function->setup_buffer( order_option, side_option, uplo_option, - diag_option, transA_option, transB_option, + try + { + my_function->setup_buffer( order_option, side_option, uplo_option, + diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); - - my_function->initialize_cpu_buffer(); - my_function->initialize_gpu_buffer(); - my_function->setup_apiCallCount(apiCallCount); - my_function->call_func(); // do a calculation first to get any compilation out of the way - my_function->reset_gpu_write_buffer(); // reset GPU write buffer - } - catch( std::exception& exc ) - { - std::cerr << exc.what( ) << std::endl; - return 1; - } - if(roundtrip=="roundtrip"||roundtrip=="both") - { - timer.Reset(); - for( cl_uint i = 0; i < profileCount; ++i ) - { - my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option, - diag_option, transA_option, transB_option, + my_function->initialize_cpu_buffer(); + my_function->initialize_gpu_buffer(); + my_function->setup_apiCallCount(apiCallCount); + my_function->call_func(); // do a calculation first to get any compilation out of the way + my_function->reset_gpu_write_buffer(); // reset GPU write buffer + } + catch( std::exception& exc ) + { + std::cerr << exc.what( ) << std::endl; + return 1; + } + if(roundtrip=="roundtrip"||roundtrip=="both") + { + timer.Reset(); + for( cl_uint i = 0; i < profileCount; ++i ) + { + my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option, + diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); - my_function->initialize_cpu_buffer(); - /*my_function->initialize_gpu_buffer(); - my_function->call_func(); - my_function->read_gpu_buffer(); - my_function->reset_gpu_write_buffer();*/ - - if(memalloc=="default") - { - my_function->roundtrip_func(); - } - else if (memalloc=="alloc_host_ptr") - { - my_function->allochostptr_roundtrip_func(); - } - else if (memalloc=="use_host_ptr") - { - my_function->usehostptr_roundtrip_func(); - } - else if (memalloc=="copy_host_ptr") - { - my_function->copyhostptr_roundtrip_func(); - } - else if (memalloc=="use_persistent_mem_amd") - { - my_function->usepersismem_roundtrip_func(); - } - else if (memalloc=="rect_mem") - { - my_function->roundtrip_func_rect(); - } - //my_function->reset_gpu_write_buffer(); - my_function->releaseGPUBuffer_deleteCPUBuffer(); - } + my_function->initialize_cpu_buffer(); + /*my_function->initialize_gpu_buffer(); + my_function->call_func(); + my_function->read_gpu_buffer(); + my_function->reset_gpu_write_buffer();*/ + + if(memalloc=="default") + { + my_function->roundtrip_func(); + } + else if (memalloc=="alloc_host_ptr") + { + my_function->allochostptr_roundtrip_func(); + } + else if (memalloc=="use_host_ptr") + { + my_function->usehostptr_roundtrip_func(); + } + else if (memalloc=="copy_host_ptr") + { + my_function->copyhostptr_roundtrip_func(); + } + else if (memalloc=="use_persistent_mem_amd") + { + my_function->usepersismem_roundtrip_func(); + } + else if (memalloc=="rect_mem") + { + my_function->roundtrip_func_rect(); + } + //my_function->reset_gpu_write_buffer(); + my_function->releaseGPUBuffer_deleteCPUBuffer(); + } - if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) - { - //std::cout << timer << std::endl; - timer.pruneOutliers( 3.0 ); - std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl; - std::cout << "BLAS (round trip) execution Gflops < " << - my_function->gflops_formula() << " >: " << my_function->gflops() << - std::endl; - } - } - if(roundtrip=="noroundtrip"||roundtrip=="both") - { - timer.Reset(); - my_function->setup_buffer( order_option, side_option, uplo_option, - diag_option, transA_option, transB_option, + if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) + { + //std::cout << timer << std::endl; + timer.pruneOutliers( 3.0 ); + std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl; + std::cout << "BLAS (round trip) execution Gflops < " << + my_function->gflops_formula() << " >: " << my_function->gflops() << + std::endl; + } + } + if(roundtrip=="noroundtrip"||roundtrip=="both") + { + timer.Reset(); + my_function->setup_buffer( order_option, side_option, uplo_option, + diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); - my_function->initialize_cpu_buffer(); - my_function->initialize_gpu_buffer(); - my_function->setup_apiCallCount( apiCallCount ); + my_function->initialize_cpu_buffer(); + my_function->initialize_gpu_buffer(); + my_function->setup_apiCallCount( apiCallCount ); + + for (cl_uint i = 0; i < profileCount; ++i) - { - my_function->call_func(); - } - my_function->read_gpu_buffer(); - //my_function->reset_gpu_write_buffer(); - my_function->releaseGPUBuffer_deleteCPUBuffer(); + { + my_function->call_func(); + } + my_function->read_gpu_buffer(); - if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) - { - //std::cout << timer << std::endl; - timer.pruneOutliers( 3.0 ); - std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl; - std::cout << "BLAS kernel execution Gflops < " << - my_function->gflops_formula() << " >: " << my_function->gflops() << - std::endl; - } - } - delete my_function; - return 0; -} + my_function->validate_with_cblas(validate); + + //my_function->reset_gpu_write_buffer(); + my_function->releaseGPUBuffer_deleteCPUBuffer(); + if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) + { + //std::cout << timer << std::endl; + timer.pruneOutliers( 3.0 ); + std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl; + std::cout << "BLAS kernel execution Gflops < " << + my_function->gflops_formula() << " >: " << my_function->gflops() << + std::endl; + } + } + delete my_function; + return 0; +} From 4ca856ea0fbabb79f0a1c45fa6c112401f050950 Mon Sep 17 00:00:00 2001 From: tim Date: Wed, 15 Jun 2016 11:06:54 -0500 Subject: [PATCH 2/5] give more details of how to get CBLAS on windows --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f4af967..8de7d7ec 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,9 @@ The simple example below shows how to use clBLAS to compute an OpenCL accelerate * Googletest v1.6 * Latest Boost * CPU BLAS - - Netlib CBLAS (recommended: "apt-get install libblas-dev" if on ubuntu) + - Netlib CBLAS (recommended) + Ubuntu: install by "apt-get install libblas-dev" + Windows: download & install lapack-3.6.0 which comes with CBLAS - or ACML on windows/linux; Accelerate on Mac OSX ### Performance infrastructure From 5ae16767f19e2b88f00a6af1e8f08750f74e39ed Mon Sep 17 00:00:00 2001 From: tim Date: Wed, 15 Jun 2016 14:34:23 -0500 Subject: [PATCH 3/5] find the netlib library dir & library in Cmake files --- src/CMakeLists.txt | 8 ++++++++ src/FindNetlib.cmake | 19 +++++++++++++++++++ src/client/CMakeLists.txt | 1 + 3 files changed, 28 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33a91ee2..73ba594e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -265,6 +265,14 @@ if( BUILD_TEST ) endif( ) endif( ) +if( BUILD_CLIENT ) + if( NETLIB_FOUND ) + else( ) + message( WARNING "Not find Netlib; BUILD_CLIENT needs the Netlib CBLAS library" ) + endif() +endif() + + # This will define OPENCL_FOUND find_package( OpenCL ${OPENCL_VERSION} ) diff --git a/src/FindNetlib.cmake b/src/FindNetlib.cmake index a32474ed..6a21e613 100644 --- a/src/FindNetlib.cmake +++ b/src/FindNetlib.cmake @@ -100,6 +100,25 @@ if( NOT contains_BLAS EQUAL -1 ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( NETLIB DEFAULT_MSG Netlib_BLAS_LIBRARY ) endif( ) + +#look for netlib cblas header +if( UNIX ) + find_path(Netlib_INCLUDE_DIRS cblas.h + HINTS + /usr/include + ) +else() + find_path(Netlib_INCLUDE_DIRS cblas.h + HINTS + ${Netlib_ROOT}/CBLAS/include/ + ) +endif() + +if( Netlib_INCLUDE_DIRS ) +else() + message(WARNING "Cannot find cblas.h") +endif() + if( NETLIB_FOUND ) list( APPEND Netlib_LIBRARIES ${Netlib_BLAS_LIBRARY} ) else( ) diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt index e0bc0029..752b19b3 100644 --- a/src/client/CMakeLists.txt +++ b/src/client/CMakeLists.txt @@ -48,6 +48,7 @@ include_directories( ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/tests/include + ${Netlib_INCLUDE_DIRS} .) add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER}) From 0c4d2665aec68f183558c9b9a5033c25e2832c42 Mon Sep 17 00:00:00 2001 From: tim Date: Thu, 16 Jun 2016 10:18:42 -0500 Subject: [PATCH 4/5] forget to add this file --- src/client/clfunc_common.hpp | 89 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp index fc2057ba..99ae1c43 100644 --- a/src/client/clfunc_common.hpp +++ b/src/client/clfunc_common.hpp @@ -26,6 +26,7 @@ #include "blas-math.h" #include "test-limits.h" #include "dis_warning.h" +#include "cblas.h" #include "clBLAS.h" #if defined(__APPLE__) || defined(__MACOSX) @@ -77,6 +78,52 @@ randomScale() return t; } +CBLAS_ORDER +clblasToCblas_order(clblasOrder value) +{ + switch (value) { + case clblasRowMajor: return CblasRowMajor; + case clblasColumnMajor: return CblasColMajor; + } +} + +CBLAS_TRANSPOSE +clblasToCblas_operation(clblasTranspose value) +{ + switch (value) { + case clblasNoTrans: return CblasNoTrans; + case clblasTrans: return CblasTrans; + case clblasConjTrans: return CblasConjTrans; + } +} + +CBLAS_UPLO +clblasToCblas_fill(clblasUplo value) +{ + switch (value) { + case clblasUpper: return CblasUpper; + case clblasLower: return CblasLower; + } +} + +CBLAS_SIDE +clblasToCblas_side(clblasSide value) +{ + switch (value) { + case clblasLeft: return CblasLeft; + case clblasRight: return CblasRight; + } +} + +CBLAS_DIAG +clblasToCblas_diag(clblasDiag value) +{ + switch (value) { + case clblasNonUnit: return CblasNonUnit; + case clblasUnit: return CblasUnit; + } +} + std::string prettyPrintClStatus( const cl_int& status ) { @@ -269,7 +316,7 @@ class clblasFunc virtual ~clblasFunc() { clblasTeardown(); - + for (unsigned int i = 0; i < numQueues; i++) { OPENCL_V_THROW( clReleaseCommandQueue(queues_[i]), "releasing command queue" ); } @@ -278,21 +325,21 @@ class clblasFunc void wait_and_check() { - cl_int err; + cl_int err; cl_int wait_status = clWaitForEvents(1, &event_); if( wait_status != CL_SUCCESS ) { - if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) - { - clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS, + if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) + { + clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &err, NULL ); - std::cout << "blas function execution status error: " << err << std::endl; + std::cout << "blas function execution status error: " << err << std::endl; exit(1); - } + } else { - std::cout << "blas function wait status error: " << wait_status << std::endl; + std::cout << "blas function wait status error: " << wait_status << std::endl; exit(1); } } @@ -300,14 +347,16 @@ class clblasFunc double time_in_ns() { - StatisticalTimer& timer = StatisticalTimer::getInstance( ); + StatisticalTimer& timer = StatisticalTimer::getInstance( ); return timer.getAverageTime( timer_id ) * 1e9; } + virtual void validate_with_cblas(int v) {} + virtual void call_func() = 0; virtual double gflops() = 0; virtual std::string gflops_formula() = 0; - virtual void setup_apiCallCount(cl_uint apiCallCount){} + virtual void setup_apiCallCount(cl_uint apiCallCount){} virtual void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, @@ -317,20 +366,20 @@ class clblasFunc virtual void initialize_cpu_buffer() = 0; virtual void initialize_gpu_buffer() = 0; virtual void reset_gpu_write_buffer() = 0; - virtual void read_gpu_buffer() = 0; - virtual void roundtrip_func() = 0; - virtual void roundtrip_func_rect() {} - virtual void allochostptr_roundtrip_func() {} - virtual void usehostptr_roundtrip_func() {} - virtual void copyhostptr_roundtrip_func() {} - virtual void usepersismem_roundtrip_func() {} - virtual void roundtrip_setup_buffer(int order_option, int side_option, + virtual void read_gpu_buffer() = 0; + virtual void roundtrip_func() = 0; + virtual void roundtrip_func_rect() {} + virtual void allochostptr_roundtrip_func() {} + virtual void usehostptr_roundtrip_func() {} + virtual void copyhostptr_roundtrip_func() {} + virtual void usepersismem_roundtrip_func() {} + virtual void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) = 0; - virtual void releaseGPUBuffer_deleteCPUBuffer()=0; + virtual void releaseGPUBuffer_deleteCPUBuffer()=0; StatisticalTimer& timer; StatisticalTimer::sTimerID timer_id; @@ -347,7 +396,7 @@ class clblasFunc clblasOrder order_; cl_event event_; size_t maxMemAllocSize; + int validate_; }; // class clblasFunc #endif // ifndef CLBLAS_BENCHMARK_COMMON_HXX__ - From 8415f31f7fa26eb19cc3183745f46f2633d2cb6c Mon Sep 17 00:00:00 2001 From: tim Date: Thu, 16 Jun 2016 13:55:05 -0500 Subject: [PATCH 5/5] disable the validation on windows currently: no easy solution of building/linking netlib CBLAS on windows --- src/client/clfunc_common.hpp | 11 ++++++++++- src/client/clfunc_xgemm.hpp | 18 +++++++++++++++--- src/client/clfunc_xtrmm.hpp | 28 ++++++++++++++++++---------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp index 99ae1c43..0f22ef0f 100644 --- a/src/client/clfunc_common.hpp +++ b/src/client/clfunc_common.hpp @@ -26,7 +26,11 @@ #include "blas-math.h" #include "test-limits.h" #include "dis_warning.h" + +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else #include "cblas.h" +#endif #include "clBLAS.h" #if defined(__APPLE__) || defined(__MACOSX) @@ -78,6 +82,9 @@ randomScale() return t; } +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else + CBLAS_ORDER clblasToCblas_order(clblasOrder value) { @@ -124,6 +131,8 @@ clblasToCblas_diag(clblasDiag value) } } +#endif + std::string prettyPrintClStatus( const cl_int& status ) { @@ -352,7 +361,7 @@ class clblasFunc } virtual void validate_with_cblas(int v) {} - + virtual void call_func() = 0; virtual double gflops() = 0; virtual std::string gflops_formula() = 0; diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp index 1d3d258a..57c283de 100644 --- a/src/client/clfunc_xgemm.hpp +++ b/src/client/clfunc_xgemm.hpp @@ -21,7 +21,6 @@ #define CLBLAS_BENCHMARK_XGEMM_HXX__ #include "clfunc_common.hpp" -#include "cblas.h" template struct xGemmBuffer @@ -76,10 +75,12 @@ class xGemm : public clblasFunc timer.Stop(timer_id); } - + void validate_with_cblas(int validate) { - if(validate) + #if defined ( _WIN32 ) || defined ( _WIN64 ) + #else + if(validate) { initialize_cpu_buffer(); initialize_gpu_buffer(); @@ -87,6 +88,7 @@ class xGemm : public clblasFunc read_gpu_buffer(); validation(); } + #endif } @@ -1004,7 +1006,11 @@ class xGemm : public clblasFunc void xGemm_Function(bool flush, cl_uint apiCallCount = 1); unsigned int numQueuesToUse; cl_event events_[numQueues]; + +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else void validation(); +#endif }; // class xgemm template<> @@ -1219,6 +1225,10 @@ gflops_formula() return "8.0*M*N*K/time"; } +#if defined ( _WIN32 ) || defined (_WIN64 ) + +#else + template<> void xGemm:: @@ -1293,4 +1303,6 @@ validation() printf("Error of clblas_zgemm against cblas_zgemm = %f \n", norm_error); } +#endif + #endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp index 8125845a..0cd1ff46 100644 --- a/src/client/clfunc_xtrmm.hpp +++ b/src/client/clfunc_xtrmm.hpp @@ -68,6 +68,8 @@ class xTrmm : public clblasFunc void validate_with_cblas(int validate) { + #if defined ( _WIN32 ) || defined ( _WIN64 ) + #else if(validate) { initialize_cpu_buffer(); @@ -76,6 +78,7 @@ class xTrmm : public clblasFunc read_gpu_buffer(); validation(); } + #endif } @@ -486,7 +489,10 @@ class xTrmm : public clblasFunc private: xTrmmBuffer buffer_; +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else void validation(); +#endif }; // class xTrmm template<> @@ -808,14 +814,17 @@ gflops_formula() } } +#if defined ( _WIN32 ) || defined ( _WIN64 ) +#else + template<> void xTrmm:: validation() { cblas_strmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), - clblasToCblas_fill(buffer_.uplo_), - clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.a_ + buffer_.offA_, buffer_.lda_, @@ -834,8 +843,8 @@ xTrmm:: validation() { cblas_dtrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), - clblasToCblas_fill(buffer_.uplo_), - clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.a_ + buffer_.offA_, buffer_.lda_, @@ -853,8 +862,8 @@ xTrmm:: validation() { cblas_ctrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), - clblasToCblas_fill(buffer_.uplo_), - clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, &(buffer_.alpha_), buffer_.a_ + buffer_.offA_, buffer_.lda_, @@ -874,8 +883,8 @@ xTrmm:: validation() { cblas_ztrmm(clblasToCblas_order(order_), clblasToCblas_side(buffer_.side_), - clblasToCblas_fill(buffer_.uplo_), - clblasToCblas_operation(buffer_.trans_a_), + clblasToCblas_fill(buffer_.uplo_), + clblasToCblas_operation(buffer_.trans_a_), clblasToCblas_diag(buffer_.diag_), buffer_.m_, buffer_.n_, &(buffer_.alpha_), buffer_.a_ + buffer_.offA_, buffer_.lda_, @@ -886,9 +895,8 @@ validation() double norm_error = cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_copy, 1)/ cblas_dznrm2(buffer_.lda_ * buffer_.n_, buffer_.b_, 1); printf("Error of clblas_ztrmm against cblas_ztrmm = %f \n", norm_error); - - } +#endif #endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__