openmp_cuda.cc
/*
 * Copyright 2009-2020 The VOTCA Development Team
 * (http://www.votca.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 *
 * You may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Local VOTCA includes
#include "votca/xtp/openmp_cuda.h"

namespace votca {
namespace xtp {

// Has to be declared because of
// https://stackoverflow.com/questions/9110487/undefined-reference-to-a-static-member
Index OpenMP_CUDA::number_of_gpus = -1;

Index OpenMP_CUDA::AvailableGPUs() {
#ifdef USE_CUDA
  return count_available_gpus();
#else
  return 0;
#endif
}

void OpenMP_CUDA::SetNoGPUs(Index number) {
  if (number < 0 || number > AvailableGPUs()) {
    number_of_gpus = AvailableGPUs();
  } else {
    number_of_gpus = number;
  }
}

Index OpenMP_CUDA::UsingGPUs() { return number_of_gpus; }

OpenMP_CUDA::OpenMP_CUDA() {
  inside_Parallel_region_ = OPENMP::InsideActiveParallelRegion();
  threadID_parent_ = OPENMP::getThreadId();

  cpus_.resize(getNumberThreads());

#ifdef USE_CUDA
  Index no_gpus = UsingGPUs();
  gpus_.clear();
  if (inside_Parallel_region_) {
    if (threadID_parent_ < no_gpus) {
      gpus_.push_back(GPU_data(threadID_parent_));
    }
  } else {
    if (no_gpus > getNumberThreads()) {
      no_gpus = getNumberThreads();
    }
    for (Index i = 0; i < no_gpus; i++) {
      gpus_.push_back(GPU_data(i));
    }
  }
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::setOperators(const std::vector<Eigen::MatrixXd>& tensor,
                               const Eigen::MatrixXd& rightoperator) {
  rOP_ = rightoperator;

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    const Eigen::MatrixXd& head = tensor.front();
    gpu.push_back(head.rows(), head.cols());
    gpu.push_back(rightoperator);
    gpu.push_back(head.rows(), rightoperator.cols());
  }
}
#else
void OpenMP_CUDA::setOperators(const std::vector<Eigen::MatrixXd>&,
                               const Eigen::MatrixXd& rightoperator) {
  rOP_ = rightoperator;
}
#endif

#ifdef USE_CUDA
bool OpenMP_CUDA::isInVector(Index Id, const std::vector<GPU_data>& vec) {
  return (std::find_if(vec.begin(), vec.end(), [&Id](const GPU_data& d) {
            return d.Id == Id;
          }) != vec.end());
}
bool OpenMP_CUDA::isGPUthread(Index ParentThreadId) const {
  return isInVector(ParentThreadId, gpus_);
}
#endif

Index OpenMP_CUDA::getParentThreadId(Index OpenmpThreadId) const {
  return inside_Parallel_region_ ? threadID_parent_ : OpenmpThreadId;
}

Index OpenMP_CUDA::getLocalThreadId(Index ParentThreadId) const {
  return inside_Parallel_region_ ? 0 : ParentThreadId;
}

Index OpenMP_CUDA::getNumberThreads() const {
  return inside_Parallel_region_ ? 1 : OPENMP::getMaxThreads();
}

/*
 * The Cuda device behaves like a server that receives matrix-matrix
 * multiplications from a single stream (an Nvidia queue) and handles them
 * asynchronously. It performs the following operations when receiving
 * a request:
 * 1. Check that there is enough space for the arrays
 * 2. Allocate memory for each matrix
 * 3. Copy the matrix to the allocated space
 * 4. Perform the matrix multiplication
 * 5. Return the result matrix
 * The Cuda device knows to which memory address it needs to copy back the
 * result. see: https://docs.nvidia.com/cuda/cublas/index.html#thread-safety2
 */
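
// Illustrative usage sketch (not part of the original source): how a caller
// can drive the request flow described above. The operands are staged once
// with setOperators(); each OpenMP thread then issues MultiplyRight()
// requests, which run on a GPU if the calling thread owns one and otherwise
// fall back to Eigen on the CPU. The variables `tensor` and `right` are
// hypothetical inputs used only for this sketch.
//
//   OpenMP_CUDA gemm_handle;
//   gemm_handle.setOperators(tensor, right);
//   #pragma omp parallel for
//   for (Index i = 0; i < Index(tensor.size()); i++) {
//     gemm_handle.MultiplyRight(tensor[i], OPENMP::getThreadId());
//   }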

#ifdef USE_CUDA
void OpenMP_CUDA::MultiplyRight(Eigen::MatrixXd& tensor, Index OpenmpThread) {

  Index threadid = getParentThreadId(OpenmpThread);
  if (isGPUthread(threadid)) {
    GPU_data& gpu = gpus_[getLocalThreadId(threadid)];
    gpu.activateGPU();
    gpu.Mat(0).copy_to_gpu(tensor);
    gpu.pipe().gemm(gpu.Mat(0), gpu.Mat(1), gpu.Mat(2));
    tensor = gpu.Mat(2);
  } else {
    tensor *= rOP_();
  }
  return;
}

#else
void OpenMP_CUDA::MultiplyRight(Eigen::MatrixXd& tensor, Index) {
  tensor *= rOP_();
}
#endif

void OpenMP_CUDA::setOperators(const Eigen::MatrixXd& leftoperator,
                               const Eigen::MatrixXd& rightoperator) {
  lOP_ = leftoperator;
  rOP_ = rightoperator;

#ifdef USE_CUDA
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.push_back(leftoperator);
    gpu.push_back(leftoperator.cols(), rightoperator.rows());
    gpu.push_back(leftoperator.rows(), rightoperator.rows());
    gpu.push_back(rightoperator);
    gpu.push_back(leftoperator.rows(), rightoperator.cols());
  }
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::MultiplyLeftRight(Eigen::MatrixXd& matrix,
                                    Index OpenmpThread) {

  Index threadid = getParentThreadId(OpenmpThread);
  if (isGPUthread(threadid)) {
    GPU_data& gpu = gpus_[getLocalThreadId(threadid)];
    gpu.activateGPU();
    gpu.Mat(1).copy_to_gpu(matrix);
    gpu.pipe().gemm(gpu.Mat(0), gpu.Mat(1), gpu.Mat(2));
    gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3), gpu.Mat(4));
    matrix = gpu.Mat(4);
  } else {
    matrix = lOP_() * matrix * rOP_();
  }
  return;
}
#else
void OpenMP_CUDA::MultiplyLeftRight(Eigen::MatrixXd& matrix, Index) {
  matrix = lOP_() * matrix * rOP_();
}
#endif

#ifdef USE_CUDA
void OpenMP_CUDA::createTemporaries(Index rows, Index cols) {

  std::for_each(cpus_.begin(), cpus_.end(),
                [&](CPU_data& d) { d.InitializeReduce(cols, cols); });

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.push_back(rows, 1);
    gpu.push_back(rows, cols);
    gpu.push_back(rows, cols);
    gpu.push_back(cols, cols);
    gpu.temp.back()->setZero();
  }
}
#else
void OpenMP_CUDA::createTemporaries(Index, Index cols) {
  std::for_each(cpus_.begin(), cpus_.end(),
                [&](CPU_data& d) { d.InitializeReduce(cols, cols); });
}
#endif

void OpenMP_CUDA::PushMatrix(const Eigen::MatrixXd& matrix,
                             Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(1).copy_to_gpu(matrix);
  } else {
    cpus_[threadid].ref_mat = matrix;
  }
#else
  cpus_[threadid].ref_mat = matrix;
#endif
}

void OpenMP_CUDA::A_TDA(const Eigen::VectorXd& vec, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    cpu.reduce() +=
        cpu.ref_mat().transpose() * vec.asDiagonal() * cpu.ref_mat();
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(0).copy_to_gpu(vec);
    gpu.pipe().diag_gemm(gpu.Mat(1).transpose(), gpu.Mat(0), gpu.Mat(2));
    gpu.pipe().gemm(gpu.Mat(1).transpose(), gpu.Mat(2), gpu.Mat(3), 1.0);
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::createTemporaries(const Eigen::VectorXd& vec,
                                    const Eigen::MatrixXd& input, Index rows1,
                                    Index rows2, Index cols) {

  std::for_each(cpus_.begin(), cpus_.end(), [&](CPU_data& d) {
    d.InitializeReduce(input.rows(), input.cols());
    d.InitializeVec(input.rows());
  });

  rOP_ = input;
  vec_ = vec;
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.push_back(vec);
    gpu.push_back(input);
    gpu.push_back(rows1, cols);
    gpu.push_back(rows2, cols);
    gpu.push_back(rows1 * rows2, 1);
    gpu.push_back(rows1 * rows2, 1);
    gpu.push_back(input.rows(), input.cols());
    gpu.temp.back()->setZero();
  }
}
#else
void OpenMP_CUDA::createTemporaries(const Eigen::VectorXd& vec,
                                    const Eigen::MatrixXd& input, Index, Index,
                                    Index) {

  std::for_each(cpus_.begin(), cpus_.end(), [&](CPU_data& d) {
    d.InitializeReduce(input.rows(), input.cols());
    d.InitializeVec(input.rows());
  });

  rOP_ = input;
  vec_ = vec;
}
#endif

void OpenMP_CUDA::PrepareMatrix1(Eigen::MatrixXd& mat, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(2).copy_to_gpu(mat);
    gpu.pipe().diag_gemm(gpu.Mat(2), gpu.Mat(0), gpu.Mat(2));
  } else {
    cpus_[threadid].ref_mat = mat;
    mat *= vec_().asDiagonal();
  }
#else
  cpus_[threadid].ref_mat = mat;
  mat *= vec_().asDiagonal();
#endif
}

void OpenMP_CUDA::SetTempZero(Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(4).setZero();
  } else {
    cpus_[threadid].temp_vec.setZero();
  }
#else
  cpus_[threadid].temp_vec.setZero();
#endif
}

void OpenMP_CUDA::PrepareMatrix2(const Eigen::Block<const Eigen::MatrixXd>& mat,
                                 bool Hd2, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    if (Hd2) {
      Eigen::Map<Eigen::MatrixXd> row(cpu.temp_vec.data(), mat.rows(),
                                      cpu.ref_mat().rows());
      row += mat * cpus_[threadid].ref_mat().transpose();
    } else {
      Eigen::Map<Eigen::MatrixXd> row(cpu.temp_vec.data(), cpu.ref_mat().rows(),
                                      mat.rows());
      row += cpu.ref_mat() * mat.transpose();
    }
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(3).copy_to_gpu(mat);
    if (Hd2) {
      gpu.Mat(4).reshape(gpu.Mat(3).rows(), gpu.Mat(2).rows());
      gpu.pipe().gemm(gpu.Mat(3), gpu.Mat(2).transpose(), gpu.Mat(4), 1.0);
    } else {
      gpu.Mat(4).reshape(gpu.Mat(2).rows(), gpu.Mat(3).rows());
      gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3).transpose(), gpu.Mat(4), 1.0);
    }
    gpu.Mat(4).reshape(gpu.Mat(2).rows() * gpu.Mat(3).rows(), 1);
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

void OpenMP_CUDA::Addvec(const Eigen::VectorXd& row, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(5).copy_to_gpu(row);
    gpu.pipe().axpy(gpu.Mat(5), gpu.Mat(4), 1.0);
  } else {
    cpus_[threadid].temp_vec += row;
  }
#else
  cpus_[threadid].temp_vec += row;
#endif
}

void OpenMP_CUDA::MultiplyRow(Index row, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    cpus_[threadid].reduce().row(row) =
        cpus_[threadid].temp_vec.transpose() * rOP_();
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.pipe().gemm(gpu.Mat(4).transpose(), gpu.Mat(1), gpu.Mat(6).row(row),
                    0.0);
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

#ifdef USE_CUDA
void OpenMP_CUDA::createAdditionalTemporaries(Index rows, Index cols) {

#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    gpu.resize(2, rows, cols);
    gpu.resize(3, rows, cols);
    gpu.resize(4, rows, rows);
  }
}
#else
void OpenMP_CUDA::createAdditionalTemporaries(Index, Index) {}
#endif

void OpenMP_CUDA::PushMatrix1(const Eigen::MatrixXd& mat, Index OpenmpThread) {

  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(2).copy_to_gpu(mat);
  } else {
    cpus_[threadid].ref_mat = mat;
  }
#else
  cpus_[threadid].ref_mat = mat;
#endif
}

void OpenMP_CUDA::MultiplyBlocks(const Eigen::Block<const Eigen::MatrixXd>& mat,
                                 Index i1, Index i2, Index OpenmpThread) {
  Index parentid = getParentThreadId(OpenmpThread);
  Index threadid = getLocalThreadId(parentid);
  auto cpucomp = [&]() {
    CPU_data& cpu = cpus_[threadid];
    const Eigen::MatrixXd block = cpu.ref_mat() * mat.transpose();
    cpu.reduce().middleRows(i1 * block.rows(), block.rows()) +=
        block * rOP_().middleRows(i2 * block.rows(), block.rows());
    if (i1 != i2) {
      cpu.reduce().middleRows(i2 * block.rows(), block.rows()) +=
          block.transpose() *
          rOP_().middleRows(i1 * block.rows(), block.rows());
    }
  };
#ifdef USE_CUDA
  if (isGPUthread(parentid)) {
    GPU_data& gpu = gpus_[threadid];
    gpu.activateGPU();
    gpu.Mat(3).copy_to_gpu(mat);
    gpu.pipe().gemm(gpu.Mat(2), gpu.Mat(3).transpose(), gpu.Mat(4));
    Index blocksize = gpu.Mat(4).rows();
    gpu.pipe().gemm(gpu.Mat(4),
                    gpu.Mat(1).middleRows(i2 * blocksize, blocksize),
                    gpu.Mat(6).middleRows(i1 * blocksize, blocksize), 1.0);
    if (i1 != i2) {
      gpu.pipe().gemm(gpu.Mat(4).transpose(),
                      gpu.Mat(1).middleRows(i1 * blocksize, blocksize),
                      gpu.Mat(6).middleRows(i2 * blocksize, blocksize), 1.0);
    }
  } else {
    cpucomp();
  }
#else
  cpucomp();
#endif
}

Eigen::MatrixXd OpenMP_CUDA::getReductionVar() {
#ifdef USE_CUDA
#pragma omp parallel for num_threads(gpus_.size())
  for (Index i = 0; i < Index(gpus_.size()); i++) {
    GPU_data& gpu = gpus_[i];
    gpu.activateGPU();
    cpus_[i].reduce() = *(gpu.temp.back());
  }
#endif
  for (Index i = 1; i < Index(cpus_.size()); i++) {
    cpus_[0].reduce() += cpus_[i].reduce();
  }
  return cpus_[0].reduce();
}

}  // namespace xtp
}  // namespace votca