Kokkos_CrsMatrix_MP_Vector_Cuda.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined( __CUDACC__)

#include "Kokkos_CrsMatrix_MP_Vector.hpp"
#include "Kokkos_Core.hpp"
#include "Stokhos_Cuda_DeviceProp.hpp"

//----------------------------------------------------------------------------
// Specializations of Kokkos::CrsMatrix for Sacado::MP::Vector scalar type
// and Cuda device
//----------------------------------------------------------------------------

namespace Stokhos {

namespace details {

// We can make things slightly faster for the ensemble multiply kernel with
// NumPerThread == 1 by using at most 32 registers per thread for 100%
// occupancy, which we get with these launch bounds on Kepler. For
// NumPerThread > 1 it is worse though due to too many spilled registers.
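// (For reference: a Kepler SM has 65536 32-bit registers and at most 2048
// resident threads, so full occupancy allows 65536/2048 = 32 registers per
// thread; the launch bounds below request 2 resident blocks of 1024 threads.)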
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}

// Kernel implementing y = A * x for Cuda device where
//   A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>*,...>,
//   x and y are rank 1
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently only works for statically sized MP::Vector
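// (Each Sacado::MP::Vector value is an ensemble of scalars of fixed,
// compile-time size; fastAccessCoeff(ee) below indexes ensemble component ee
// directly, which is what lets the kernel stripe components across threadIdx.x.)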
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<const Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space>,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>*,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>*,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix<const MatrixValue,
                                  MatrixOrdinal,
                                  Device,
                                  MatrixMemory,
                                  MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Multiply that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-1 vectors
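  // (Thread mapping: each CUDA block processes blockDim.y matrix rows, one
  // row per threadIdx.y; within a row, the blockDim.x threads each accumulate
  // NumPerThread ensemble components, component ee = threadIdx.x + e*blockDim.x.)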
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
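      // (e.g. with a 16x16 block, the 256 threads cooperatively load the 17
      // row offsets covering the block's 16 rows in a single coalesced pass.)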
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;

        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
            blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain
          // coalesced accesses (don't need __syncthreads() assuming
          // blockDim.x <= warp_size).
          // Note: it might be a little faster if we ensured aligned access
          // to m_Agraph.entries() and m_Avals below.
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          for ( size_type col = 0; col < num_col; ++col ) {
            size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }

        }

        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block. Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM. We use these values by default if
    // the user-specified block dimensions are zero.
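    // (For example, with an ensemble of 16 doubles and a default-constructed
    // dev_config this gives threads_per_vector = 16, rows_per_block = 256/16
    // = 16, block = dim3(16,16,1), and num_per_thread = 16/16 = 1 below.)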

    const size_type value_dimension = Kokkos::dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Check threads_per_vector is not greater than warp size (kernels assume
    // this)
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    TEUCHOS_TEST_FOR_EXCEPTION(
      threads_per_vector > warp_size, std::logic_error,
      "Threads/vector cannot exceed Cuda warp size");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
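    // (64 resident warps = 2048 threads, the Kepler per-SM maximum.)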
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
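    // (Shared memory holds block.y+1 row offsets plus block.x*block.y column
    // indices; the bank size below is matched to sizeof(size_type) to avoid
    // shared-memory bank conflicts.)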
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

// Kernel implementing y = A * x for Cuda device where
//   A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>**,...>,
//   x and y are rank 2
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently only works for statically sized MP::Vector
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< KokkosSparse::CrsMatrix<const Sacado::MP::Vector<MatrixStorage>,
                                          MatrixOrdinal,
                                          Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space>,
                                          MatrixMemory,
                                          MatrixSize>,
                  Kokkos::View< const Sacado::MP::Vector<InputStorage>**,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>**,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Device<Kokkos::Cuda, typename Kokkos::Cuda::memory_space> Device;
  typedef typename Device::execution_space execution_space;
  typedef typename execution_space::size_type size_type;

  typedef KokkosSparse::CrsMatrix<const MatrixValue,
                                  MatrixOrdinal,
                                  Device,
                                  MatrixMemory,
                                  MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Specialization that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-2 vectors
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;
    const size_type m_num_vec_cols;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.extent(0)-1 )
      , m_num_vec_cols( x.extent(1) )
      {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd = sh_row[threadIdx.y+1];

        // We could potentially improve performance by putting this loop
        // inside the matrix column loop, however that would require storing
        // an array for sum over vector columns, which would require shared
        // memory.
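        // (As written, the column indices and matrix values for the row are
        // re-read from global memory once per vector column; the alternative
        // would need NumPerThread*m_num_vec_cols accumulators per thread.)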
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;

          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time to maintain
            // coalesced accesses (don't need __syncthreads() assuming
            // blockDim.x <= warp_size).
            // Note: it might be a little faster if we ensured aligned access
            // to m_Agraph.entries() and m_Avals below.
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();

            for ( size_type col = 0; col < num_col; ++col ) {
              size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }

          }

          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }

        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block. Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM. We use these values by default if
    // the user-specified block dimensions are zero.

    const size_type value_dimension = dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.extent(0)-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details

} // namespace Stokhos

#endif /* #if defined( __CUDACC__) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */
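// Usage sketch (illustrative only; the update functor name below is
// hypothetical, not part of Stokhos): MPMultiply::apply() combines each row's
// dot product into y through the update functor, so plain y = A*x corresponds
// to a functor that simply assigns:
//
//   struct AssignUpdate {
//     template <typename Out, typename In>
//     __device__ void operator()(Out& y, const In& sum) const { y = sum; }
//   };
//
//   // With matrix_type / input_vector_type / output_vector_type matching the
//   // const MP::Vector CrsMatrix and rank-1 View types of the specialization:
//   Stokhos::details::MPMultiply<matrix_type, input_vector_type,
//                                output_vector_type, AssignUpdate>
//     ::apply(A, x, y, AssignUpdate());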