for (Index i = 0; i < no_gpus; i++) {
  gpus_.push_back(GPU_data(i));
}
// setOperators(tensor, rightoperator): stage the right operator and the
// per-GPU work buffers (GPU code path).
void OpenMP_CUDA::setOperators(const std::vector<Eigen::MatrixXd>& tensor,
                               const Eigen::MatrixXd& rightoperator) {
  // ...
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    const Eigen::MatrixXd& head = tensor.front();
    gpu.push_back(head.rows(), head.cols());            // Mat(0): input slot
    gpu.push_back(rightoperator);                       // Mat(1): right operator
    gpu.push_back(head.rows(), rightoperator.cols());   // Mat(2): result slot
  }
}

// CPU-only definition of the same overload (body elided in this listing):
// void OpenMP_CUDA::setOperators(const std::vector<Eigen::MatrixXd>& tensor,
//                                const Eigen::MatrixXd& rightoperator) { ... }
bool OpenMP_CUDA::isInVector(Index Id, const std::vector<GPU_data>& vec) {
  return (std::find_if(vec.begin(), vec.end(), [&Id](const GPU_data& d) {
            // ... (predicate body elided in this listing)
          }) != vec.end());
}

bool OpenMP_CUDA::isGPUthread(Index ParentThreadId) const {
  return isInVector(ParentThreadId, gpus_);
}
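// A minimal self-contained sketch of the membership test used above, with a
// hypothetical Worker struct standing in for GPU_data; the member name `Id`
// and the namespace are illustrative, not taken from the source.
#include <algorithm>
#include <vector>

namespace sketch {
using Index = long;  // stand-in for votca::Index

struct Worker {
  Index Id;
};

inline bool isInVector(Index Id, const std::vector<Worker>& vec) {
  // True if any element carries the requested Id.
  return std::find_if(vec.begin(), vec.end(),
                      [&Id](const Worker& d) { return d.Id == Id; }) !=
         vec.end();
}
}  // namespace sketch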
// Right-multiplication of a pushed matrix by the stored right operator
// (GPU path): Mat(2) = Mat(0) * Mat(1).
if (isGPUthread(threadid)) {
  GPU_data& gpu = gpus_[threadid];
  gpu.Mat(0).copy_to_gpu(tensor);
  gpu.pipe().gemm(gpu.Mat(0), gpu.Mat(1), gpu.Mat(2));
  // ...
}
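// Hedged note on the CPU-side equivalent of the product above in plain Eigen
// (assuming Mat(0) holds the pushed matrix and Mat(1) the right operator
// staged by setOperators; the variable names are illustrative):
//
//   result = input * rightoperator;   // Mat(2) = Mat(0) * Mat(1)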
// setOperators(leftoperator, rightoperator): stage both operators for a
// left * X * right product (GPU code path).
void OpenMP_CUDA::setOperators(const Eigen::MatrixXd& leftoperator,
                               const Eigen::MatrixXd& rightoperator) {
  rOP_ = rightoperator;
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.push_back(leftoperator);                                // Mat(0)
    gpu.push_back(leftoperator.cols(), rightoperator.rows());   // Mat(1): input slot
    gpu.push_back(leftoperator.rows(), rightoperator.rows());   // Mat(2): left * input
    gpu.push_back(rightoperator);                               // Mat(3)
    gpu.push_back(leftoperator.rows(), rightoperator.cols());   // Mat(4): result slot
  }
}
// MultiplyLeftRight: sandwich the input matrix between the two stored
// operators (GPU code path).
void OpenMP_CUDA::MultiplyLeftRight(Eigen::MatrixXd& matrix,
                                    Index OpenmpThread) {
  // ...
  if (isGPUthread(threadid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(1).copy_to_gpu(matrix);
    gpu.pipe().gemm(gpu.Mat(0), gpu.Mat(1), gpu.Mat(2));
    gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3), gpu.Mat(4));
    // ...
  }
}
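// Hedged note on what the two chained GEMMs compute, using the buffers staged
// by setOperators(leftoperator, rightoperator) above (Mat(0) = leftoperator,
// Mat(1) = input matrix, Mat(3) = rightoperator):
//
//   tmp    = leftoperator * matrix;    // Mat(2) = Mat(0) * Mat(1)
//   result = tmp * rightoperator;      // Mat(4) = Mat(2) * Mat(3)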
// createTemporaries(rows, cols): per-thread reduction buffers on the CPU side
// and scratch matrices on each GPU.
void OpenMP_CUDA::createTemporaries(Index rows, Index cols) {
  std::for_each(cpus_.begin(), cpus_.end(),
                [&](CPU_data& d) { d.InitializeReduce(cols, cols); });

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.push_back(rows, 1);      // Mat(0): diagonal vector
    gpu.push_back(rows, cols);   // Mat(1): pushed matrix
    gpu.push_back(rows, cols);   // Mat(2): scratch
    gpu.push_back(cols, cols);   // Mat(3): reduction target
    gpu.temp.back()->setZero();
  }
}

// CPU-only variant: only the reduction buffers are initialized.
//   std::for_each(cpus_.begin(), cpus_.end(),
//                 [&](CPU_data& d) { d.InitializeReduce(cols, cols); });
// PushMatrix: stage a matrix on the GPU (into Mat(1)) or keep a host-side
// reference for the CPU fallback.
void OpenMP_CUDA::PushMatrix(const Eigen::MatrixXd& matrix,
                             Index OpenmpThread) {
  // ...
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(1).copy_to_gpu(matrix);
  } else {
    cpus_[threadid].ref_mat = matrix;
  }
}

// CPU-only variant: the matrix is only referenced on the host.
//   cpus_[threadid].ref_mat = matrix;
// A_TDA: contract the staged matrix with a diagonal weight vector and
// accumulate into the per-thread reduction buffer.
void OpenMP_CUDA::A_TDA(const Eigen::VectorXd& vec, Index OpenmpThread) {
  // ...
  auto cpucomp = [&]() {
    // ... (CPU fallback, elided in this listing)
  };

  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(0).copy_to_gpu(vec);
    gpu.pipe().diag_gemm(gpu.Mat(1).transpose(), gpu.Mat(0), gpu.Mat(2));
    gpu.pipe().gemm(gpu.Mat(1).transpose(), gpu.Mat(2), gpu.Mat(3), 1.0);
  } else {
    cpucomp();
  }
}
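// Hedged note on the net effect of the two pipeline calls above (with
// Mat(0) = vec, Mat(1) = the matrix staged by PushMatrix, Mat(3) = the
// cols x cols reduction buffer from createTemporaries):
//
//   reduce += matrix.transpose() * vec.asDiagonal() * matrix;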
// Set-up overload that stages a weight vector and an operator matrix plus the
// block scratch space needed for the block multiplications below.
void OpenMP_CUDA::createTemporaries(const Eigen::VectorXd& vec,
                                    const Eigen::MatrixXd& input, Index rows1,
                                    Index rows2, Index cols) {
  // ...
  std::for_each(cpus_.begin(), cpus_.end(), [&](CPU_data& d) {
    d.InitializeReduce(input.rows(), input.cols());
    d.InitializeVec(input.rows());
  });

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    // ...
    gpu.push_back(input);
    gpu.push_back(rows1, cols);
    gpu.push_back(rows2, cols);
    gpu.push_back(rows1 * rows2, 1);
    gpu.push_back(rows1 * rows2, 1);
    gpu.push_back(input.rows(), input.cols());
    gpu.temp.back()->setZero();
  }
}

// CPU-only variant: only the reduction buffers and the temporary vector.
//   d.InitializeReduce(input.rows(), input.cols());
//   d.InitializeVec(input.rows());
// PrepareMatrix1: stage the matrix and scale it by the diagonal weight vector.
void OpenMP_CUDA::PrepareMatrix1(Eigen::MatrixXd& mat, Index OpenmpThread) {
  // ...
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(2).copy_to_gpu(mat);
    gpu.pipe().diag_gemm(gpu.Mat(2), gpu.Mat(0), gpu.Mat(2));
  } else {
    cpus_[threadid].ref_mat = mat;
    mat *= vec_().asDiagonal();
  }
}

// CPU-only variant:
//   cpus_[threadid].ref_mat = mat;
//   mat *= vec_().asDiagonal();
// SetTempZero: clear the per-thread accumulation buffer.
void OpenMP_CUDA::SetTempZero(Index OpenmpThread) {
  // ...
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(4).setZero();
  } else {
    cpus_[threadid].temp_vec.setZero();
  }
}

// CPU-only variant:
//   cpus_[threadid].temp_vec.setZero();
// PrepareMatrix2: multiply the staged matrix with a block of the input and
// accumulate the (reshaped) product into the temporary buffer.
void OpenMP_CUDA::PrepareMatrix2(const Eigen::Block<const Eigen::MatrixXd>& mat,
                                 bool Hd2, Index OpenmpThread) {
  // ...
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    if (Hd2) {
      Eigen::Map<Eigen::MatrixXd> row(cpu.temp_vec.data(), mat.rows(),
                                      cpu.ref_mat().rows());
      row += mat * cpus_[threadid].ref_mat().transpose();
    } else {
      Eigen::Map<Eigen::MatrixXd> row(cpu.temp_vec.data(),
                                      cpu.ref_mat().rows(), mat.rows());
      row += cpu.ref_mat() * mat.transpose();
    }
  };

  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(3).copy_to_gpu(mat);
    if (Hd2) {
      gpu.Mat(4).reshape(gpu.Mat(3).rows(), gpu.Mat(2).rows());
      gpu.pipe().gemm(gpu.Mat(3), gpu.Mat(2).transpose(), gpu.Mat(4), 1.0);
    } else {
      gpu.Mat(4).reshape(gpu.Mat(2).rows(), gpu.Mat(3).rows());
      gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3).transpose(), gpu.Mat(4), 1.0);
    }
    gpu.Mat(4).reshape(gpu.Mat(2).rows() * gpu.Mat(3).rows(), 1);
  } else {
    cpucomp();
  }
}
// Addvec: add a vector into the accumulation buffer.
void OpenMP_CUDA::Addvec(const Eigen::VectorXd& row, Index OpenmpThread) {
  // ...
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(5).copy_to_gpu(row);
    gpu.pipe().axpy(gpu.Mat(5), gpu.Mat(4), 1.0);
  } else {
    cpus_[threadid].temp_vec += row;
  }
}

// CPU-only variant:
//   cpus_[threadid].temp_vec += row;
// MultiplyRow: contract the accumulated vector with the stored operator into
// one row of the reduction matrix.
void OpenMP_CUDA::MultiplyRow(Index row, Index OpenmpThread) {
  // ...
  auto cpucomp = [&]() {
    cpus_[threadid].reduce().row(row) =
        cpus_[threadid].temp_vec.transpose() * rOP_();
  };
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.pipe().gemm(gpu.Mat(4).transpose(), gpu.Mat(1), gpu.Mat(6).row(row),
                    /* ... */);
  } else {
    cpucomp();
  }
}
// createAdditionalTemporaries(rows, cols): re-dimension the GPU scratch
// matrices for the block multiplications that follow.
void OpenMP_CUDA::createAdditionalTemporaries(Index rows, Index cols) {
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.resize(2, rows, cols);
    gpu.resize(3, rows, cols);
    gpu.resize(4, rows, rows);
  }
}
// PushMatrix1: stage a matrix on the GPU (into Mat(2)) or keep a host-side
// reference for the CPU fallback.
void OpenMP_CUDA::PushMatrix1(const Eigen::MatrixXd& mat, Index OpenmpThread) {
  // ...
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(2).copy_to_gpu(mat);
  } else {
    cpus_[threadid].ref_mat = mat;
  }
}

// CPU-only variant:
//   cpus_[threadid].ref_mat = mat;
// MultiplyBlocks: multiply the staged matrix with a block and scatter the
// result into the (i1, i2) and (i2, i1) row blocks of the reduction matrix.
void OpenMP_CUDA::MultiplyBlocks(const Eigen::Block<const Eigen::MatrixXd>& mat,
                                 Index i1, Index i2, Index OpenmpThread) {
  // ...
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    const Eigen::MatrixXd block = cpu.ref_mat() * mat.transpose();
    cpu.reduce().middleRows(i1 * block.rows(), block.rows()) +=
        block * rOP_().middleRows(i2 * block.rows(), block.rows());
    if (i1 != i2) {
      cpu.reduce().middleRows(i2 * block.rows(), block.rows()) +=
          block.transpose() *
          rOP_().middleRows(i1 * block.rows(), block.rows());
    }
  };
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.Mat(3).copy_to_gpu(mat);
    gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3).transpose(), gpu.Mat(4));
    Index blocksize = gpu.Mat(4).rows();
    gpu.pipe().gemm(gpu.Mat(4),
                    gpu.Mat(1).middleRows(i2 * blocksize, blocksize),
                    gpu.Mat(6).middleRows(i1 * blocksize, blocksize), 1.0);
    if (i1 != i2) {
      gpu.pipe().gemm(gpu.Mat(4).transpose(),
                      gpu.Mat(1).middleRows(i1 * blocksize, blocksize),
                      gpu.Mat(6).middleRows(i2 * blocksize, blocksize), 1.0);
    }
  } else {
    cpucomp();
  }
}
// getReductionVar: copy the GPU results back into the per-thread buffers and
// return the combined reduction.
Eigen::MatrixXd OpenMP_CUDA::getReductionVar() {
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    cpus_[i].reduce() = *(gpu.temp.back());
  }
  // ... (the per-thread buffers are combined before returning)
  return cpus_[0].reduce();
}
// Interface summary for the members referenced in the listing above
// (signatures as declared for OpenMP_CUDA and its CPU_data helper):
//
//   // GPU/thread bookkeeping
//   static Index AvailableGPUs();
//   static void SetNoGPUs(Index number);
//   static Index number_of_gpus;
//   Index count_available_gpus();
//   Index getNumberThreads() const;
//   Index getParentThreadId(Index OpenmpThreadId) const;
//   Index getLocalThreadId(Index ParentThreadId) const;
//   bool InsideActiveParallelRegion();
//   bool inside_Parallel_region_;
//
//   // operator setup and products
//   void setOperators(const std::vector<Eigen::MatrixXd>& tensor,
//                     const Eigen::MatrixXd& rightoperator);
//   void MultiplyRight(Eigen::MatrixXd& matrix, Index OpenmpThread);
//   void MultiplyLeftRight(Eigen::MatrixXd& matrix, Index OpenmpThread);
//   void createTemporaries(Index rows, Index cols);
//   void createAdditionalTemporaries(Index rows, Index cols);
//   void PushMatrix(const Eigen::MatrixXd& mat, Index OpenmpThread);
//   void PushMatrix1(const Eigen::MatrixXd& mat, Index OpenmpThread);
//   void A_TDA(const Eigen::VectorXd& vec, Index OpenmpThread);
//   void PrepareMatrix1(Eigen::MatrixXd& mat, Index OpenmpThread);
//   void PrepareMatrix2(const Eigen::Block<const Eigen::MatrixXd>& mat,
//                       bool Hd2, Index OpenmpThread);
//   void SetTempZero(Index OpenmpThread);
//   void Addvec(const Eigen::VectorXd& row, Index OpenmpThread);
//   void MultiplyRow(Index row, Index OpenmpThread);
//   void MultiplyBlocks(const Eigen::Block<const Eigen::MatrixXd>& mat,
//                       Index i1, Index i2, Index OpenmpThread);
//   Eigen::MatrixXd getReductionVar();
//
//   // data members
//   std::vector<CPU_data> cpus_;
//   DefaultReference<const Eigen::MatrixXd> rOP_, lOP_;
//   DefaultReference<const Eigen::VectorXd> vec_;
//
//   // CPU_data helpers
//   DefaultReference<Eigen::MatrixXd> ref_mat;
//   Eigen::MatrixXd& reduce();
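// Hedged usage sketch of the dispatcher as a whole. The calling pattern,
// header path, and thread-id handling are illustrative assumptions; only the
// member functions themselves come from the interface above.
#include <vector>

#include <omp.h>

#include <Eigen/Dense>

#include "votca/xtp/openmp_cuda.h"  // assumed header location

void example(const std::vector<Eigen::MatrixXd>& tensor,
             const Eigen::MatrixXd& rightoperator) {
  votca::xtp::OpenMP_CUDA dispatcher;
  // Stage the shared right operator once (and the per-GPU buffers with it).
  dispatcher.setOperators(tensor, rightoperator);
#pragma omp parallel for
  for (votca::Index i = 0; i < votca::Index(tensor.size()); i++) {
    Eigen::MatrixXd m = tensor[i];
    // Threads mapped to a GPU run the product on their device; the remaining
    // threads fall back to plain Eigen on the CPU.
    dispatcher.MultiplyRight(m, omp_get_thread_num());
  }
}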