votca 2024.2-dev
Loading...
Searching...
No Matches
cudapipeline.h
Go to the documentation of this file.
1/*
2 * Copyright 2009-2020 The VOTCA Development Team
3 * (http://www.votca.org)
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License")
6 *
7 * You may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20#ifndef VOTCA_XTP_CUDAPIPELINE_H
21#define VOTCA_XTP_CUDAPIPELINE_H
22
23// CMake generated file
24#include "votca_xtp_config.h"
25#ifndef USE_CUDA
26#error Cuda not enabled
27#endif
28
29#include <type_traits>
30// Local VOTCA includes
31#include "cudamatrix.h"
32
/*
 * \brief Perform tensor-matrix multiplications on a GPU
 *
 * The `CudaPipeline` class handles the allocation and deallocation of arrays
 * on the GPU.
 */
39
40namespace votca {
41namespace xtp {
42
/* \brief The CudaPipeline class offloads Eigen operations to an *Nvidia* GPU
 * using the CUDA language. The cuBLAS handle is the context manager for all
 * the resources needed by cuBLAS, while a stream is a queue of sequential
 * operations executed on the Nvidia device.
 */
class CudaPipeline {
 public:
  // Binds this pipeline to the given CUDA device and acquires the cuBLAS
  // handle and the asynchronous stream used for all subsequent operations.
  CudaPipeline(int deviceID) : deviceID_{deviceID} {
    checkCuda(cudaSetDevice(deviceID));
    // Previously the return codes of cublasCreate/cudaStreamCreate were
    // ignored; a failed creation would only surface as a confusing error at
    // the first use of the handle or stream. Fail fast instead.
    cublasStatus_t status = cublasCreate(&handle_);
    if (status != CUBLAS_STATUS_SUCCESS) {
      throw std::runtime_error("cublasCreate failed on gpu " +
                               std::to_string(deviceID) +
                               " with errorcode:" + cudaGetErrorEnum(status));
    }
    checkCuda(cudaStreamCreate(&stream_));
  }

  ~CudaPipeline();

  // The pipeline owns GPU resources, so it is neither default-constructible
  // nor copyable.
  CudaPipeline() = delete;
  CudaPipeline(const CudaPipeline &) = delete;
  CudaPipeline &operator=(const CudaPipeline &) = delete;

  // C = A * b.asDiagonal()
  template <class M>
  void diag_gemm(const M &A, const CudaMatrix &b, CudaMatrix &C) const;

  // B += alpha * A
  void axpy(const CudaMatrix &A, CudaMatrix &B, double alpha = 1.0) const;

  // C = A * B + beta * C (beta defaults to 0, i.e. plain multiplication);
  // transposition is encoded in the matrix types, see the definition below.
  template <class M1, class M2, class M3>
  void gemm(M1 &&A, M2 &&B, M3 &&C, double beta = 0.0) const;

  const cudaStream_t &get_stream() const { return stream_; };

  int getDeviceId() const { return deviceID_; }

 private:
  int deviceID_ = 0;
  // The cublas handle allocates hardware resources on the host and device.
  cublasHandle_t handle_;

  // Asynchronous stream
  cudaStream_t stream_;
};
84
/*
 * Call the gemm function from cublas, resulting in the multiplication of the
 * two matrices: C = A * B + beta * C, queued on this pipeline's stream.
 */
template <class M1, class M2, class M3>
inline void CudaPipeline::gemm(M1 &&A, M2 &&B, M3 &&C, double beta) const {

  using MatA = std::decay_t<M1>;
  using MatB = std::decay_t<M2>;
  using MatC = std::decay_t<M3>;
  static_assert(!MatC::transposed(), "C in gemm cannot be transposed atm");

  // The cuBLAS operation flags and the contraction (inner) dimension follow
  // from the compile-time transposition state of A and B.
  const cublasOperation_t opA =
      MatA::transposed() ? CUBLAS_OP_T : CUBLAS_OP_N;
  const int innerA = MatA::transposed() ? int(A.rows()) : int(A.cols());
  const cublasOperation_t opB =
      MatB::transposed() ? CUBLAS_OP_T : CUBLAS_OP_N;
  const int innerB = MatB::transposed() ? int(B.cols()) : int(B.rows());

  if (innerA != innerB) {
    throw std::runtime_error(
        "Shape mismatch in cuda gemm " + std::to_string(innerA) + ":" +
        std::to_string(innerB) + " A:" + OutputDimension(A) +
        " B:" + OutputDimension(B) + " C:" + OutputDimension(C));
  }

  // Scalar constants for calling blas
  const double alpha = 1.;
  cublasSetStream(handle_, stream_);
  cublasStatus_t status =
      cublasDgemm(handle_, opA, opB, int(C.rows()), int(C.cols()), innerA,
                  &alpha, A.data(), int(A.ld()), B.data(), int(B.ld()), &beta,
                  C.data(), int(C.ld()));
  if (status != CUBLAS_STATUS_SUCCESS) {
    throw std::runtime_error("dgemm failed on gpu " +
                             std::to_string(deviceID_) +
                             " with errorcode:" + cudaGetErrorEnum(status));
  }
}
131
/*
 * Multiply matrix A by a diagonal matrix whose diagonal is stored in the
 * vector b, writing the result into C (per the declaration:
 * C = A * b.asDiagonal()). The operation is queued on this pipeline's stream.
 */
template <class M>
inline void CudaPipeline::diag_gemm(const M &A, const CudaMatrix &b,
                                    CudaMatrix &C) const {

  // b holds a diagonal, so it must be a row or column vector.
  if (b.cols() != 1 && b.rows() != 1) {
    throw std::runtime_error("B Matrix in Cublas diag_gemm must be a vector");
  }

  // For a transposed A the diagonal is applied from the left instead of the
  // right, so the dimension of A that must match b switches from cols to rows.
  cublasSideMode_t mode = CUBLAS_SIDE_RIGHT;
  Index Adim = A.cols();
  if (M::transposed()) {
    mode = CUBLAS_SIDE_LEFT;
    Adim = A.rows();
  }

  if (Adim != b.size()) {
    throw std::runtime_error("Shape mismatch in cuda diag_gemm: A" +
                             OutputDimension(A) + " b" + OutputDimension(b));
  }

  // cublasDdgmm scales the rows/columns of A by b (stride 1) into C.
  cublasSetStream(handle_, stream_);
  cublasStatus_t status =
      cublasDdgmm(handle_, mode, int(A.rows()), int(A.cols()), A.data(),
                  int(A.ld()), b.data(), 1, C.data(), int(C.ld()));

  if (status != CUBLAS_STATUS_SUCCESS) {
    throw std::runtime_error("diag_gemm failed on gpu " +
                             std::to_string(deviceID_) +
                             " with errorcode:" + cudaGetErrorEnum(status));
  }
}
163
164} // namespace xtp
165} // namespace votca
166
167#endif // VOTCA_XTP_CUDAPIPELINE_H
double * data() const
Definition cudamatrix.h:110
CudaPipeline(int deviceID)
CudaPipeline & operator=(const CudaPipeline &)=delete
void gemm(M1 &&A, M2 &&B, M3 &&C, double beta=0.0) const
void axpy(const CudaMatrix &A, CudaMatrix &B, double alpha=1.0) const
const cudaStream_t & get_stream() const
void diag_gemm(const M &A, const CudaMatrix &b, CudaMatrix &C) const
CudaPipeline(const CudaPipeline &)=delete
std::string cudaGetErrorEnum(cublasStatus_t error)
Definition cudamatrix.cc:39
std::string OutputDimension(const M &mat)
Definition cudamatrix.h:53
void checkCuda(cudaError_t result)
Definition cudamatrix.cc:25
base class for all analysis tools
Definition basebead.h:33
Eigen::Index Index
Definition types.h:26