Sacado Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
mat_vec/driver.cpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Sacado Package
5// Copyright (2006) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// This library is free software; you can redistribute it and/or modify
11// it under the terms of the GNU Lesser General Public License as
12// published by the Free Software Foundation; either version 2.1 of the
13// License, or (at your option) any later version.
14//
15// This library is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18// Lesser General Public License for more details.
19//
20// You should have received a copy of the GNU Lesser General Public
21// License along with this library; if not, write to the Free Software
22// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23// USA
24// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25// (etphipp@sandia.gov).
26//
27// ***********************************************************************
28// @HEADER
29
30// A performance test that computes the derivative of a simple Kokkos kernel
31// using various Fad classes
32
33#include "mat_vec.hpp"
36
37#include "Sacado.hpp"
38
39#include "Teuchos_CommandLineProcessor.hpp"
40#include "Teuchos_StandardCatchMacros.hpp"
41
42// For vtune
43#include <sys/types.h>
44#include <unistd.h>
45#include <algorithm>
46
47void
48print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
49 const std::string& name)
50{
51 std::cout << name << "\t "
52 << perf.time << "\t "
53 << perf.throughput << "\t "
54 << perf.time / perf_base.time
55 << std::endl;
56}
57
58template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
59 typename ... ViewArgs>
60void
61do_times(const size_t m,
62 const size_t n,
63 const size_t p,
64 const size_t nloop,
65 const bool value,
66 const bool analytic,
67 const bool sfad,
68 const bool slfad,
69 const bool dfad,
70 const bool flat,
71 const bool hierarchical,
72 const bool check)
73{
74 Perf perf_value;
75 perf_value.time = 1.0;
76
77 // Run value
78 if (value) {
79 try {
80 Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
81 perf_value = perf;
82 print_perf(perf, perf_value, p, "Value ");
83 }
84 catch(std::exception& e) {
85 std::cout << e.what() << std::endl;
86 }
87 }
88
89 // Run analytic
90 if (analytic) {
91 try {
92 Perf perf =
93 do_time_analytic<ViewArgs...>(m,n,p,nloop,check);
94 print_perf(perf, perf_value, p, "Analytic ");
95 }
96 catch(std::exception& e) {
97 std::cout << e.what() << std::endl;
98 }
99 }
100 if(analytic && p == SFadSize) {
101 try {
102 Perf perf =
103 do_time_analytic_s<SFadSize, ViewArgs...>(m,n,nloop,check);
104 print_perf(perf, perf_value, p, "Analytic-s");
105 }
106 catch(std::exception& e) {
107 std::cout << e.what() << std::endl;
108 }
109 }
110 if(analytic && p <= SLFadSize) {
111 try {
112 Perf perf =
113 do_time_analytic_sl<SLFadSize, ViewArgs...>(m,n,p,nloop,check);
114 print_perf(perf, perf_value, p, "Analytic-sl");
115 }
116 catch(std::exception& e) {
117 std::cout << e.what() << std::endl;
118 }
119 }
120
121 // Run flat SFad
122 if (flat && sfad && p == SFadSize) {
123 try {
124 Perf perf =
125 do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
126 print_perf(perf, perf_value, p, "SFad ");
127 }
128 catch(std::exception& e) {
129 std::cout << e.what() << std::endl;
130 }
131 }
132
133 // Run flat SLFad
134 if (flat && slfad && p <= SLFadSize) {
135 try {
136 Perf perf =
137 do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
138 print_perf(perf, perf_value, p, "SLFad ");
139 }
140 catch(std::exception& e) {
141 std::cout << e.what() << std::endl;
142 }
143 }
144
145 // Run flat DFad
146 if (flat && dfad) {
147 try {
148 Perf perf =
149 do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
150 print_perf(perf, perf_value, p, "DFad ");
151 }
152 catch(std::exception& e) {
153 std::cout << e.what() << std::endl;
154 }
155 try {
156 Perf perf_scratch =
157 do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
158 print_perf(perf_scratch, perf_value, p, "DFad Scratch");
159 }
160 catch(std::exception& e) {
161 std::cout << e.what() << std::endl;
162 }
163 }
164
165 // Run hierarchical SFad
166 if (hierarchical && sfad && p == HierSFadSize) {
167 try {
168 Perf perf =
169 do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,p,nloop,check);
170 print_perf(perf, perf_value, p, "H. SFad ");
171 }
172 catch(std::exception& e) {
173 std::cout << e.what() << std::endl;
174 }
175 }
176
177 // Run hierarchical SLFad
178 if (hierarchical && slfad && p <= HierSLFadSize) {
179 try {
180 Perf perf =
181 do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,p,nloop,check);
182 print_perf(perf, perf_value, p, "H. SLFad ");
183 }
184 catch(std::exception& e) {
185 std::cout << e.what() << std::endl;
186 }
187 }
188
189 // Run hierarchical DFad
190 if (hierarchical && dfad) {
191 try {
192 Perf perf =
193 do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
194 print_perf(perf, perf_value, p, "H. DFad ");
195 }
196 catch(std::exception& e) {
197 std::cout << e.what() << std::endl;
198 }
199 try {
200 Perf perf_scratch =
201 do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
202 print_perf(perf_scratch, perf_value, p, "H. DFad Scratch");
203 }
204 catch(std::exception& e) {
205 std::cout << e.what() << std::endl;
206 }
207 }
208
209}
210
// Number of entries in the layout option tables; passed to
// Teuchos::CommandLineProcessor::setOption together with layout_values and
// layout_names when the "layout" option is registered in main().
const int num_layout_types = 3;
// Command-line spellings of the layout choices.  Also indexed by the
// LayoutType value (layout_names[layout]) when do_times_layout prints the
// table heading.
const char *layout_names[] = { "left", "right", "default" };
220
221template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
222 typename Device>
223void
224do_times_layout(const size_t m,
225 const size_t n,
226 const size_t p,
227 const size_t nloop,
228 const bool value,
229 const bool analytic,
230 const bool sfad,
231 const bool slfad,
232 const bool dfad,
233 const bool flat,
234 const bool hierarchical,
235 const bool check,
236 const LayoutType& layout,
237 const std::string& device)
238{
239 int prec = 2;
240 std::cout.setf(std::ios::scientific);
241 std::cout.precision(prec);
242 std::cout << std::endl
243 << device
244 << " performance for layout "
245 << layout_names[layout]
246 << " m = " << m << " n = " << n << " p = " << p
247 << std::endl << std::endl;
248 std::cout << "Computation \t Time \t Throughput \t Ratio" << std::endl;
249
250 if (layout == LAYOUT_LEFT)
251 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
252 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
253 else if (layout == LAYOUT_RIGHT)
254 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
255 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
256 else
257 do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
258 (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
259}
260
// Connect executable to vtune for profiling.
//
// Builds an "amplxe-cl -collect hotspots" command line targeting this
// process's pid, echoes it to std::cout, launches it in the background
// through the shell, and then sleeps for 10 seconds before returning.
// Results are written to ./vtune.
void connect_vtune() {
  std::stringstream cmd;
  pid_t my_os_pid=getpid();
  const std::string vtune_loc =
    "amplxe-cl";
  const std::string output_dir = "./vtune";
  cmd << vtune_loc
      << " -collect hotspots -result-dir " << output_dir
      << " -target-pid " << my_os_pid << " &";
  std::cout << cmd.str() << std::endl;

  // The command is backgrounded with '&', so a non-zero status here only
  // tells us the shell itself could not be started; report it rather than
  // silently profiling nothing.
  const int launch_status = system(cmd.str().c_str());
  if (launch_status != 0)
    std::cerr << "Warning: failed to launch vtune (status "
              << launch_status << ")" << std::endl;

  // Return value deliberately ignored: this call is only a delay.
  (void) system("sleep 10");
}
275
276int main(int argc, char* argv[]) {
277 Kokkos::initialize(argc,argv);
278
279 bool success = true;
280 try {
281
282 // Set up command line options
283 Teuchos::CommandLineProcessor clp(false);
284 clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
285 int m = 100000;
286 clp.setOption("m", &m, "Number of matrix rows");
287 int n = 100;
288 clp.setOption("n", &n, "Number of matrix columns");
289 int p = SFadSize;
290 clp.setOption("p", &p, "Number of derivative components");
291 int nloop = 10;
292 clp.setOption("nloop", &nloop, "Number of loops");
293#ifdef KOKKOS_ENABLE_SERIAL
294 bool serial = 0;
295 clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
296#endif
297#ifdef KOKKOS_ENABLE_OPENMP
298 bool openmp = 0;
299 clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
300#endif
301#ifdef KOKKOS_ENABLE_THREADS
302 bool threads = 0;
303 clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
304#endif
305#ifdef KOKKOS_ENABLE_CUDA
306 bool cuda = 0;
307 clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
308#endif
309 bool print_config = false;
310 clp.setOption("print-config", "no-print-config", &print_config,
311 "Whether to print Kokkos device configuration");
312 LayoutType layout = LAYOUT_DEFAULT;
313 clp.setOption("layout", &layout, num_layout_types, layout_values,
314 layout_names, "View layout");
315 bool vtune = false;
316 clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
317 bool value = true;
318 clp.setOption("value", "no-value", &value, "Run value calculation");
319 bool analytic = true;
320 clp.setOption("analytic", "no-analytic", &analytic,
321 "Run analytic derivative calculation");
322 bool sfad = true;
323 clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
324 bool slfad = true;
325 clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
326 bool dfad = true;
327 clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
328 bool flat = true;
329 clp.setOption("flat", "no-flat", &flat, "Run flat Fad derivative calculation");
330 bool hierarchical = true;
331 clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
332 bool check = false;
333 clp.setOption("check", "no-check", &check, "Check calculations are correct");
334
335 // Parse options
336 switch (clp.parse(argc, argv)) {
337 case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
338 return 0;
339 case Teuchos::CommandLineProcessor::PARSE_ERROR:
340 case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
341 return 1;
342 case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
343 break;
344 }
345
346 if (vtune)
348
349 if (print_config)
350 Kokkos::print_configuration(std::cout, true);
351
352#ifdef KOKKOS_ENABLE_SERIAL
353 if (serial) {
354 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
355 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Serial");
356 }
357#endif
358
359#ifdef KOKKOS_ENABLE_OPENMP
360 if (openmp) {
361 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
362 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"OpenMP");
363 }
364#endif
365
366#ifdef KOKKOS_ENABLE_THREADS
367 if (threads) {
368 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
369 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Threads");
370 }
371#endif
372
373#ifdef KOKKOS_ENABLE_CUDA
374 if (cuda) {
375 do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
376 m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Cuda");
377 }
378#endif
379
380 }
381 TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
382
383 Kokkos::finalize();
384
385 return !success;
386}
int main()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
double do_time_analytic(int nderiv, int nloop)
Definition fad_expr.cpp:94
Perf do_time_val(const size_t m, const size_t n, const size_t nloop, const bool check)
const char * p
const int HierSFadSize
const int SFadSize
const int SLFadSize
const int HierSLFadSize
void do_times(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check)
const LayoutType layout_values[]
const char * layout_names[]
void connect_vtune()
const int num_layout_types
void do_times_layout(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool value, const bool analytic, const bool sfad, const bool slfad, const bool dfad, const bool flat, const bool hierarchical, const bool check, const LayoutType &layout, const std::string &device)
void print_perf(const Perf &perf, const Perf &perf_base, const size_t p, const std::string &name)
@ LAYOUT_RIGHT
@ LAYOUT_DEFAULT
@ LAYOUT_LEFT
Perf do_time_analytic_s(const size_t m, const size_t n, const size_t nloop, const bool check)
Definition mat_vec.cpp:477
Perf do_time_analytic_sl(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Definition mat_vec.cpp:438
double time
double throughput