openmp_cuda.cc
/*
 * Copyright 2009-2020 The VOTCA Development Team
 * (http://www.votca.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 *
 * You may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Local VOTCA includes
#include "votca/xtp/openmp_cuda.h"

namespace votca {
namespace xtp {

// Has to be declared because of
// https://stackoverflow.com/questions/9110487/undefined-reference-to-a-static-member
Index OpenMP_CUDA::number_of_gpus = 0;

Index OpenMP_CUDA::AvailableGPUs() {
#ifdef USE_CUDA
  return count_available_gpus();
#else
  return 0;
#endif
}

void OpenMP_CUDA::SetNoGPUs(Index number) {
  if (number < 0 || number > AvailableGPUs()) {
    number_of_gpus = AvailableGPUs();
  } else {
    number_of_gpus = number;
  }
}

Index OpenMP_CUDA::UsingGPUs() { return number_of_gpus; }

OpenMP_CUDA::OpenMP_CUDA() {
  inside_Parallel_region_ = OPENMP::InsideActiveParallelRegion();
  threadID_parent_ = OPENMP::getThreadId();
  cpus_.resize(getNumberThreads());

#ifdef USE_CUDA
  Index no_gpus = UsingGPUs();
  gpus_.clear();
  if (inside_Parallel_region_) {
    if (threadID_parent_ < no_gpus) {
      gpus_.push_back(GPU_data(threadID_parent_));
    }
  } else {
    if (no_gpus > getNumberThreads()) {
      no_gpus = getNumberThreads();
    }
    for (Index i = 0; i < no_gpus; i++) {
      gpus_.push_back(GPU_data(i));
    }
  }
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::setOperators(const std::vector<Eigen::MatrixXd>& tensor,
                               const Eigen::MatrixXd& rightoperator) {
  rOP_ = rightoperator;

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    const Eigen::MatrixXd& head = tensor.front();
    gpu.push_back(head.rows(), head.cols());
    gpu.push_back(rightoperator);
    gpu.push_back(head.rows(), rightoperator.cols());
  }
}
#else
void OpenMP_CUDA::setOperators(const std::vector<Eigen::MatrixXd>&,
                               const Eigen::MatrixXd& rightoperator) {
  rOP_ = rightoperator;
}
#endif

#ifdef USE_CUDA
bool OpenMP_CUDA::isInVector(Index Id, const std::vector<GPU_data>& vec) {
  return (std::find_if(vec.begin(), vec.end(), [&Id](const GPU_data& d) {
            return d.Id == Id;
          }) != vec.end());
}
bool OpenMP_CUDA::isGPUthread(Index ParentThreadId) const {
  return isInVector(ParentThreadId, gpus_);
}
#endif

Index OpenMP_CUDA::getParentThreadId(Index OpenmpThreadId) const {
  return inside_Parallel_region_ ? threadID_parent_ : OpenmpThreadId;
}
Index OpenMP_CUDA::getLocalThreadId(Index ParentThreadId) const {
  return inside_Parallel_region_ ? 0 : ParentThreadId;
}
Index OpenMP_CUDA::getNumberThreads() const {
  return inside_Parallel_region_ ? 1 : OPENMP::getMaxThreads();
}

/*
 * The Cuda device behaves like a server that receives matrix-matrix
 * multiplications from a single stream (an Nvidia queue) and handles them
 * asynchronously. It performs the following operations when receiving
 * a request:
 * 1. Check that there is enough space for the arrays
 * 2. Allocate memory for each matrix
 * 3. Copy the matrix to the allocated space
 * 4. Perform the matrix multiplication
 * 5. Return the result matrix
 * The Cuda device knows to which memory address it needs to copy back the
 * result. See: https://docs.nvidia.com/cuda/cublas/index.html#thread-safety2
 */

#ifdef USE_CUDA
void OpenMP_CUDA::MultiplyRight(Eigen::MatrixXd& tensor, Index OpenmpThread) {

  Index threadid = getParentThreadId(OpenmpThread);
  if (isGPUthread(threadid)) {
    GPU_data& gpu = gpus_[getLocalThreadId(threadid)];
    gpu.activateGPU();
    gpu.Mat(0).copy_to_gpu(tensor);
    gpu.pipe().gemm(gpu.Mat(0), gpu.Mat(1), gpu.Mat(2));
    tensor = gpu.Mat(2);
  } else {
    tensor *= rOP_();
  }
  return;
}

#else
void OpenMP_CUDA::MultiplyRight(Eigen::MatrixXd& tensor, Index) {
  tensor *= rOP_();
}
#endif
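
/*
 * Illustrative usage sketch (not part of the original source): a caller could
 * drive the MultiplyRight() path described above from an OpenMP loop, so that
 * threads owning a GPU off-load the gemm while the remaining threads fall back
 * to the Eigen product. The container name `tensor_blocks` and the matrix
 * `right_operator` are hypothetical placeholders; passing the OpenMP thread id
 * into each call follows the pattern used throughout this file.
 *
 *   OpenMP_CUDA gemm_helper;
 *   gemm_helper.setOperators(tensor_blocks, right_operator);
 * #pragma omp parallel for schedule(dynamic)
 *   for (Index i = 0; i < Index(tensor_blocks.size()); i++) {
 *     gemm_helper.MultiplyRight(tensor_blocks[i], OPENMP::getThreadId());
 *   }
 */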

void OpenMP_CUDA::setOperators(const Eigen::MatrixXd& leftoperator,
                               const Eigen::MatrixXd& rightoperator) {
  lOP_ = leftoperator;
  rOP_ = rightoperator;

#ifdef USE_CUDA
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.push_back(leftoperator);
    gpu.push_back(leftoperator.cols(), rightoperator.rows());
    gpu.push_back(leftoperator.rows(), rightoperator.rows());
    gpu.push_back(rightoperator);
    gpu.push_back(leftoperator.rows(), rightoperator.cols());
  }
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::MultiplyLeftRight(Eigen::MatrixXd& matrix,
                                    Index OpenmpThread) {

  Index threadid = getParentThreadId(OpenmpThread);
  if (isGPUthread(threadid)) {
    GPU_data& gpu = gpus_[getLocalThreadId(threadid)];
    gpu.activateGPU();
    gpu.Mat(1).copy_to_gpu(matrix);
    gpu.pipe().gemm(gpu.Mat(0), gpu.Mat(1), gpu.Mat(2));
    gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3), gpu.Mat(4));
    matrix = gpu.Mat(4);
  } else {
    matrix = lOP_() * matrix * rOP_();
  }
  return;
}
#else
void OpenMP_CUDA::MultiplyLeftRight(Eigen::MatrixXd& matrix, Index) {
  matrix = lOP_() * matrix * rOP_();
}
#endif

#ifdef USE_CUDA
void OpenMP_CUDA::createTemporaries(Index rows, Index cols) {

  std::for_each(cpus_.begin(), cpus_.end(),
                [&](CPU_data& d) { d.InitializeReduce(cols, cols); });

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.push_back(rows, 1);
    gpu.push_back(rows, cols);
    gpu.push_back(rows, cols);
    gpu.push_back(cols, cols);
    gpu.temp.back()->setZero();
  }
}
#else
void OpenMP_CUDA::createTemporaries(Index, Index cols) {
  std::for_each(cpus_.begin(), cpus_.end(),
                [&](CPU_data& d) { d.InitializeReduce(cols, cols); });
}
#endif

void OpenMP_CUDA::PushMatrix(const Eigen::MatrixXd& matrix,
                             Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(1).copy_to_gpu(matrix);
  } else {
    cpus_[threadid].ref_mat = matrix;
  }
#else
  cpus_[threadid].ref_mat = matrix;
#endif
}

void OpenMP_CUDA::A_TDA(const Eigen::VectorXd& vec, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    cpu.reduce() +=
        cpu.ref_mat().transpose() * vec.asDiagonal() * cpu.ref_mat();
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(0).copy_to_gpu(vec);
    gpu.pipe().diag_gemm(gpu.Mat(1).transpose(), gpu.Mat(0), gpu.Mat(2));
    gpu.pipe().gemm(gpu.Mat(1).transpose(), gpu.Mat(2), gpu.Mat(3), 1.0);
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}
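
/*
 * Illustrative sketch (not part of the original source) of the reduction
 * workflow that createTemporaries()/PushMatrix()/A_TDA() support: the
 * temporaries are allocated once, each thread pushes its matrix and
 * accumulates the vec-weighted product, and the per-thread partial results
 * are summed by getReductionVar(). `matrices`, `weights`, `rows`, and `cols`
 * are hypothetical placeholders.
 *
 *   OpenMP_CUDA reducer;
 *   reducer.createTemporaries(rows, cols);
 * #pragma omp parallel for schedule(dynamic)
 *   for (Index i = 0; i < Index(matrices.size()); i++) {
 *     Index tid = OPENMP::getThreadId();
 *     reducer.PushMatrix(matrices[i], tid);
 *     reducer.A_TDA(weights[i], tid);
 *   }
 *   Eigen::MatrixXd result = reducer.getReductionVar();
 */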

#ifdef USE_CUDA
void OpenMP_CUDA::createTemporaries(const Eigen::VectorXd& vec,
                                    const Eigen::MatrixXd& input, Index rows1,
                                    Index rows2, Index cols) {

  std::for_each(cpus_.begin(), cpus_.end(), [&](CPU_data& d) {
    d.InitializeReduce(input.rows(), input.cols());
    d.InitializeVec(input.rows());
  });

  rOP_ = input;
  vec_ = vec;
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.push_back(vec);
    gpu.push_back(input);
    gpu.push_back(rows1, cols);
    gpu.push_back(rows2, cols);
    gpu.push_back(rows1 * rows2, 1);
    gpu.push_back(rows1 * rows2, 1);
    gpu.push_back(input.rows(), input.cols());
    gpu.temp.back()->setZero();
  }
}
#else
void OpenMP_CUDA::createTemporaries(const Eigen::VectorXd& vec,
                                    const Eigen::MatrixXd& input, Index, Index,
                                    Index) {

  std::for_each(cpus_.begin(), cpus_.end(), [&](CPU_data& d) {
    d.InitializeReduce(input.rows(), input.cols());
    d.InitializeVec(input.rows());
  });

  rOP_ = input;
  vec_ = vec;
}
#endif

void OpenMP_CUDA::PrepareMatrix1(Eigen::MatrixXd& mat, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(2).copy_to_gpu(mat);
    gpu.pipe().diag_gemm(gpu.Mat(2), gpu.Mat(0), gpu.Mat(2));
  } else {
    cpus_[threadid].ref_mat = mat;
    mat *= vec_().asDiagonal();
  }
#else
  cpus_[threadid].ref_mat = mat;
  mat *= vec_().asDiagonal();
#endif
}

void OpenMP_CUDA::SetTempZero(Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(4).setZero();
  } else {
    cpus_[threadid].temp_vec.setZero();
  }
#else
  cpus_[threadid].temp_vec.setZero();
#endif
}

void OpenMP_CUDA::PrepareMatrix2(const Eigen::Block<const Eigen::MatrixXd>& mat,
                                 bool Hd2, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    if (Hd2) {
      Eigen::Map<Eigen::MatrixXd> row(cpu.temp_vec.data(), mat.rows(),
                                      cpu.ref_mat().rows());
      row += mat * cpus_[threadid].ref_mat().transpose();
    } else {
      Eigen::Map<Eigen::MatrixXd> row(cpu.temp_vec.data(), cpu.ref_mat().rows(),
                                      mat.rows());
      row += cpu.ref_mat() * mat.transpose();
    }
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(3).copy_to_gpu(mat);
    if (Hd2) {
      gpu.Mat(4).reshape(gpu.Mat(3).rows(), gpu.Mat(2).rows());
      gpu.pipe().gemm(gpu.Mat(3), gpu.Mat(2).transpose(), gpu.Mat(4), 1.0);
    } else {
      gpu.Mat(4).reshape(gpu.Mat(2).rows(), gpu.Mat(3).rows());
      gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3).transpose(), gpu.Mat(4), 1.0);
    }
    gpu.Mat(4).reshape(gpu.Mat(2).rows() * gpu.Mat(3).rows(), 1);
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

void OpenMP_CUDA::Addvec(const Eigen::VectorXd& row, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(5).copy_to_gpu(row);
    gpu.pipe().axpy(gpu.Mat(5), gpu.Mat(4), 1.0);
  } else {
    cpus_[threadid].temp_vec += row;
  }
#else
  cpus_[threadid].temp_vec += row;
#endif
}

void OpenMP_CUDA::MultiplyRow(Index row, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    cpus_[threadid].reduce().row(row) =
        cpus_[threadid].temp_vec.transpose() * rOP_();
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.pipe().gemm(gpu.Mat(4).transpose(), gpu.Mat(1), gpu.Mat(6).row(row),
                    0.0);
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::createAdditionalTemporaries(Index rows, Index cols) {

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.resize(2, rows, cols);
    gpu.resize(3, rows, cols);
    gpu.resize(4, rows, rows);
  }
}
#else
void OpenMP_CUDA::createAdditionalTemporaries(Index, Index) {}
#endif

void OpenMP_CUDA::PushMatrix1(const Eigen::MatrixXd& mat, Index OpenmpThread) {

  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(2).copy_to_gpu(mat);
  } else {
    cpus_[threadid].ref_mat = mat;
  }
#else
  cpus_[threadid].ref_mat = mat;
#endif
}

void OpenMP_CUDA::MultiplyBlocks(const Eigen::Block<const Eigen::MatrixXd>& mat,
                                 Index i1, Index i2, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    const Eigen::MatrixXd block = cpu.ref_mat() * mat.transpose();
    cpu.reduce().middleRows(i1 * block.rows(), block.rows()) +=
        block * rOP_().middleRows(i2 * block.rows(), block.rows());
    if (i1 != i2) {
      cpu.reduce().middleRows(i2 * block.rows(), block.rows()) +=
          block.transpose() *
          rOP_().middleRows(i1 * block.rows(), block.rows());
    }
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(3).copy_to_gpu(mat);
    gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3).transpose(), gpu.Mat(4));
    Index blocksize = gpu.Mat(4).rows();
    gpu.pipe().gemm(gpu.Mat(4),
                    gpu.Mat(1).middleRows(i2 * blocksize, blocksize),
                    gpu.Mat(6).middleRows(i1 * blocksize, blocksize), 1.0);
    if (i1 != i2) {
      gpu.pipe().gemm(gpu.Mat(4).transpose(),
                      gpu.Mat(1).middleRows(i1 * blocksize, blocksize),
                      gpu.Mat(6).middleRows(i2 * blocksize, blocksize), 1.0);
    }
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

Eigen::MatrixXd OpenMP_CUDA::getReductionVar() {
#ifdef USE_CUDA
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    cpus_[i].reduce() = *(gpu.temp.back());
  }
#endif
  for (Index i = 1; i < Index(cpus_.size()); i++) {
    cpus_[0].reduce() += cpus_[i].reduce();
  }
  return cpus_[0].reduce();
}

}  // namespace xtp
}  // namespace votca