59 Kokkos::View< VectorValue** , Kokkos::LayoutLeft , Kokkos::Cuda > ,
// Fragment of the device-side multiply loop (surrounding lines are not part of
// this excerpt). The number of block rows is the row_map extent minus one;
// row_map is read at iBlock and iBlock+1 below.
90 const size_type blockCount = m_A.graph.row_map.extent(0) - 1 ;
// Block-row loop: runs while iBlock < blockCount and advances by gridDim.x.
// The loop's initial value is not visible here — presumably blockIdx.x; confirm.
93 iBlock < blockCount ; iBlock += gridDim.x ) {
// Entries of this block row occupy [ row_map[iBlock] , row_map[iBlock+1] ).
96 const size_type iEntryEnd = m_A.graph.row_map[iBlock+1];
97 size_type iEntry = m_A.graph.row_map[iBlock];
99 for ( ; iEntry < iEntryEnd ; ++iEntry ) {
// x: address of element 0 of the m_x column selected by this entry's graph index.
100 const VectorValue *
const x = & m_x( 0 , m_A.graph.entries(iEntry) );
// a: address of element 0 of the matrix values stored for this entry.
101 const MatrixValue *
const a = & m_A.values( 0 , iEntry );
// Only threads whose linearised index (x + blockDim.x * y) is below the block
// dimension store a result. The computation of `y` is not visible in this excerpt.
// NOTE(review): the guard uses the 2-D linear thread index while the store is
// indexed by threadIdx.x alone — confirm against the launch configuration.
106 if ( threadIdx.x + blockDim.x * threadIdx.y < m_A.block.dimension() ) {
107 m_y(threadIdx.x,iBlock) = y ;
// Fragment of the host-side launch routine (declarations of thread_max, grid,
// block and shmem are not part of this excerpt).
// Product of the maximum warp count and the warp size — presumably the
// initialiser of `thread_max`, which is compared against below; confirm.
117 Kokkos::Impl::cuda_internal_maximum_warp_count() * Kokkos::Impl::CudaTraits::WarpSize ;
// Number of block rows: row_map extent minus one.
// NOTE(review): if size_type is unsigned and row_map is empty this would wrap — confirm callers.
119 const size_type row_count = A.graph.row_map.extent(0) - 1 ;
// First dimension is capped at the smaller of row_count and the first entry of
// the maximum grid count; the other two dimensions are 1. Presumably the
// arguments of the `grid` dim3 used in the launch below — confirm.
122 std::min( row_count , Kokkos::Impl::cuda_internal_maximum_grid_count()[0] ) , 1 , 1 );
// Reject launch shapes whose block.x * block.y exceeds thread_max by throwing
// std::runtime_error with a descriptive message.
128 if ( thread_max < block.x * block.y ) {
129 std::ostringstream msg ;
130 msg <<
"Kokkos::Impl::Multiply< BlockCrsMatrix< Block , Value , Cuda > , ... >"
131 <<
" ERROR: block dimension = " << block.x * block.y
132 <<
// NOTE(review): the literal after thread_max has no leading space, so the
// message renders as "<thread_max>== maximum ..."; consider " == maximum ...".
" > " << thread_max <<
"== maximum Cuda threads per block" ;
133 throw std::runtime_error(msg.str());
// Launch the kernel with a Multiply functor constructed from (A, x, y), using
// the grid/block shapes and `shmem` as the third launch-configuration argument.
136 Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid , block , shmem >>>(
Multiply(A,x,y) );