MueLu Version of the Day
MueLu_RepartitionFactory_def.hpp
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#ifndef MUELU_REPARTITIONFACTORY_DEF_HPP
47#define MUELU_REPARTITIONFACTORY_DEF_HPP
48
49#include <algorithm>
50#include <iostream>
51#include <sstream>
52
53#include "MueLu_RepartitionFactory_decl.hpp" // TMP JG NOTE: before other includes, otherwise I cannot test the fwd declaration in _def
54
55#ifdef HAVE_MPI
56#include <Teuchos_DefaultMpiComm.hpp>
57#include <Teuchos_CommHelpers.hpp>
58#include <Teuchos_Details_MpiTypeTraits.hpp>
59
60#include <Xpetra_Map.hpp>
61#include <Xpetra_MapFactory.hpp>
62#include <Xpetra_MultiVectorFactory.hpp>
63#include <Xpetra_VectorFactory.hpp>
64#include <Xpetra_Import.hpp>
65#include <Xpetra_ImportFactory.hpp>
66#include <Xpetra_Export.hpp>
67#include <Xpetra_ExportFactory.hpp>
68#include <Xpetra_Matrix.hpp>
69#include <Xpetra_MatrixFactory.hpp>
70
71#include "MueLu_Utilities.hpp"
72
73#include "MueLu_CloneRepartitionInterface.hpp"
74
75#include "MueLu_Level.hpp"
76#include "MueLu_MasterList.hpp"
77#include "MueLu_Monitor.hpp"
78#include "MueLu_PerfUtils.hpp"
79
80namespace MueLu {
81
82 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
83  RCP<const ParameterList> RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
84    RCP<ParameterList> validParamList = rcp(new ParameterList());
85
86#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
87 SET_VALID_ENTRY("repartition: print partition distribution");
88 SET_VALID_ENTRY("repartition: remap parts");
89 SET_VALID_ENTRY("repartition: remap num values");
90 SET_VALID_ENTRY("repartition: remap accept partition");
91 SET_VALID_ENTRY("repartition: node repartition level");
92#undef SET_VALID_ENTRY
93
94 validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
95 validParamList->set< RCP<const FactoryBase> >("number of partitions", Teuchos::null, "Instance of RepartitionHeuristicFactory.");
96 validParamList->set< RCP<const FactoryBase> >("Partition", Teuchos::null, "Factory of the partition");
97
98 return validParamList;
99 }
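
  // --- Illustrative sketch (added for documentation; not part of the original file) ---
  // A minimal example of how the parameters validated above might be set from user code,
  // assuming the usual Teuchos::ParameterList workflow. The function name
  // exampleRepartitionParameters and the chosen values are hypothetical.
  inline Teuchos::RCP<Teuchos::ParameterList> exampleRepartitionParameters() {
    Teuchos::RCP<Teuchos::ParameterList> params = Teuchos::rcp(new Teuchos::ParameterList());
    params->set("repartition: remap parts",                  true);  // remap partition ids onto ranks to reduce data movement
    params->set("repartition: remap num values",             4);     // number of largest local edges each rank contributes to the remap
    params->set("repartition: print partition distribution", false); // optional ASCII map of which ranks own a partition
    return params;
  }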
100
101 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
102  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& currentLevel) const {
103    Input(currentLevel, "A");
104 Input(currentLevel, "number of partitions");
105 Input(currentLevel, "Partition");
106 }
107
108 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
109  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
110    FactoryMonitor m(*this, "Build", currentLevel);
111
112 const Teuchos::ParameterList & pL = GetParameterList();
113 // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
114 // TODO (JG): I don't really know if we want to do this.
115 bool remapPartitions = pL.get<bool> ("repartition: remap parts");
116
117 // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
118 RCP<Matrix> A = Get< RCP<Matrix> >(currentLevel, "A");
119 if (A == Teuchos::null) {
120 Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
121 return;
122 }
123 RCP<const Map> rowMap = A->getRowMap();
124 GO indexBase = rowMap->getIndexBase();
125 Xpetra::UnderlyingLib lib = rowMap->lib();
126
127 RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
128 RCP<const Teuchos::Comm<int> > comm = origComm;
129
130 int myRank = comm->getRank();
131 int numProcs = comm->getSize();
132
133 RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
134 TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
135 RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
136
138 int numPartitions = Get<int>(currentLevel, "number of partitions");
139
140 // ======================================================================================================
141 // Construct decomposition vector
142 // ======================================================================================================
143 RCP<GOVector> decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");
144
145 // check which factory provides "Partition"
146 if(remapPartitions == true && Teuchos::rcp_dynamic_cast<const CloneRepartitionInterface>(GetFactory("Partition")) != Teuchos::null) {
147      // If "Partition" is provided by a CloneRepartitionInterface class, we have to switch off remapPartitions,
148      // since we can assume the processor ids in Partition to be the expected ones. If we did remapping, we
149      // would get different processors for the different blocks, which breaks matrix-matrix multiplication.
150 remapPartitions = false;
151 }
152
153 // check special cases
154 if (numPartitions == 1) {
155 // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
156 // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
157 // in Zoltan[12]Interface).
158 // TODO: We can probably skip more work in this case (like building all extra data structures)
159 GetOStream(Runtime0) << "Only one partition: Skip call to the repartitioner." << std::endl;
160 } else if (numPartitions == -1) {
161 // No repartitioning necessary: decomposition should be Teuchos::null
162 GetOStream(Runtime0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
163 Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
164 return;
165 }
166
167    // If we're doing node-aware repartitioning, we need to be sure to get the mapping to the NodeComm's rank 0.
168 const int nodeRepartLevel = pL.get<int> ("repartition: node repartition level");
169 if(currentLevel.GetLevelID() == nodeRepartLevel) {
170      // NodePartitionInterface returns the *ranks* of the processes that receive the data, not the *partition numbers*.
171 // In a sense, we've already done remap here.
172
173 // FIXME: We need a low-comm import construction
174 remapPartitions = false;
175 }
176
177 // ======================================================================================================
178 // Remap if necessary
179 // ======================================================================================================
180 // From a user perspective, we want user to not care about remapping, thinking of it as only a performance feature.
181 // There are two problems, however.
182 // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
183 // This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
184 // each partition is the same.
185 // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
186    //     Let us consider the following example. Assume that, without remapping, processor 0 would have
187    //     GIDs {0,1,2} and processor 1 GIDs {3,4,5}, while with remapping processor 0 would contain {3,4,5} and
188    //     processor 1 {0,1,2}. Now, when we run the repartitioning algorithm on the next level (say Zoltan1 RCB), it may
189    //     depend on whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
190 // algorithm can resolve these differently. For instance, running
191 // mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
192 // with
193 // <ParameterList name="MueLu">
194 // <Parameter name="coarse: max size" type="int" value="1"/>
195 // <Parameter name="repartition: enable" type="bool" value="true"/>
196 // <Parameter name="repartition: min rows per proc" type="int" value="2"/>
197 // <ParameterList name="level 1">
198 // <Parameter name="repartition: remap parts" type="bool" value="false/true"/>
199 // </ParameterList>
200 // </ParameterList>
201 // produces different repartitioning for level 2.
202 // This different repartitioning may then escalate into different aggregation for the next level.
203 //
204 // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
205 // Fixing (2) is more complicated.
206 // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
207 // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
208 // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
209 // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
210 // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
211    // This rank permutation requires help from Epetra/Tpetra, neither of which has such an API in place.
212 // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in
213 // MPI_COMM_WORLD, which may lead to issues for logging.
214 if (remapPartitions) {
215 SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);
216
217 bool acceptPartition = pL.get<bool>("repartition: remap accept partition");
218 bool allSubdomainsAcceptPartitions;
219 int localNumAcceptPartition = acceptPartition;
220 int globalNumAcceptPartition;
221 MueLu_sumAll(comm, localNumAcceptPartition, globalNumAcceptPartition);
222 GetOStream(Statistics2) << "Number of ranks that accept partitions: " << globalNumAcceptPartition << std::endl;
223 if (globalNumAcceptPartition < numPartitions) {
224 GetOStream(Warnings0) << "Not enough ranks are willing to accept a partition, allowing partitions on all ranks." << std::endl;
225 acceptPartition = true;
226 allSubdomainsAcceptPartitions = true;
227 } else if (numPartitions > numProcs) {
228 // We are trying to repartition to a larger communicator.
229 allSubdomainsAcceptPartitions = true;
230 } else {
231 allSubdomainsAcceptPartitions = false;
232 }
233
234 DeterminePartitionPlacement(*A, *decomposition, numPartitions, acceptPartition, allSubdomainsAcceptPartitions);
235 }
236
237 // ======================================================================================================
238 // Construct importer
239 // ======================================================================================================
240 // At this point, the following is true:
241 // * Each processors owns 0 or 1 partitions
242 // * If a processor owns a partition, that partition number is equal to the processor rank
243    //   * The decomposition vector contains the partition ids that the corresponding GIDs belong to
244
245 ArrayRCP<const GO> decompEntries;
246 if (decomposition->getLocalLength() > 0)
247 decompEntries = decomposition->getData(0);
248
249#ifdef HAVE_MUELU_DEBUG
250 // Test range of partition ids
251 int incorrectRank = -1;
252 for (int i = 0; i < decompEntries.size(); i++)
253 if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
254 incorrectRank = myRank;
255 break;
256 }
257
258 int incorrectGlobalRank = -1;
259 MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
260    TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number that is out-of-range");
261#endif
262
263 Array<GO> myGIDs;
264 myGIDs.reserve(decomposition->getLocalLength());
265
266 // Step 0: Construct mapping
267 // part number -> GIDs I own which belong to this part
268 // NOTE: my own part GIDs are not part of the map
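    // Worked example (added for illustration, not from the original source): with two ranks and,
    // on rank 0, decompEntries = [0, 1, 0, 1] for local rows with GIDs {0,1,2,3}, rank 0 keeps
    // myGIDs = {0,2} and schedules GIDs {1,3} for rank 1 via sendMap[1] = {1,3}.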
269 typedef std::map<GO, Array<GO> > map_type;
270 map_type sendMap;
271 for (LO i = 0; i < decompEntries.size(); i++) {
272 GO id = decompEntries[i];
273 GO GID = rowMap->getGlobalElement(i);
274
275 if (id == myRank)
276 myGIDs .push_back(GID);
277 else
278 sendMap[id].push_back(GID);
279 }
280 decompEntries = Teuchos::null;
281
282 if (IsPrint(Statistics2)) {
283 GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
284 MueLu_sumAll(comm,numLocalKept, numGlobalKept);
285 GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100*Teuchos::as<double>(numGlobalKept)/numGlobalRows << "%)" << std::endl;
286 }
287
288 int numSend = sendMap.size(), numRecv;
289
290 // Arrayify map keys
291 Array<GO> myParts(numSend), myPart(1);
292 int cnt = 0;
293 myPart[0] = myRank;
294 for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
295 myParts[cnt++] = it->first;
296
297 // Step 1: Find out how many processors send me data
298    // partsIndexBase starts from zero, as the processor ids start from zero
299 GO partsIndexBase = 0;
300 RCP<Map> partsIHave = MapFactory ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
301 RCP<Map> partsIOwn = MapFactory ::Build(lib, numProcs, myPart(), partsIndexBase, comm);
302 RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);
303
304 RCP<GOVector> partsISend = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIHave);
305 RCP<GOVector> numPartsIRecv = Xpetra::VectorFactory<GO, LO, GO, NO>::Build(partsIOwn);
306 if (numSend) {
307 ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
308 for (int i = 0; i < numSend; i++)
309 partsISendData[i] = 1;
310 }
311 (numPartsIRecv->getDataNonConst(0))[0] = 0;
312
313 numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
314 numRecv = (numPartsIRecv->getData(0))[0];
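    // Note (added): the export with ADD above is a trick to count senders without any explicit
    // point-to-point setup. Every rank that must ship GIDs to rank p owns an overlapping copy of
    // entry p in partsIHave and contributes a 1; after the export, rank p reads numRecv = number of senders.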
315
316 // Step 2: Get my GIDs from everybody else
317 MPI_Datatype MpiType = Teuchos::Details::MpiTypeTraits<GO>::getType();
318 int msgTag = 12345; // TODO: use Comm::dup for all internal messaging
319
320 // Post sends
321 Array<MPI_Request> sendReqs(numSend);
322 cnt = 0;
323 for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
324 MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);
325
326 map_type recvMap;
327 size_t totalGIDs = myGIDs.size();
328 for (int i = 0; i < numRecv; i++) {
329 MPI_Status status;
330 MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);
331
332 // Get rank and number of elements from status
333 int fromRank = status.MPI_SOURCE, count;
334 MPI_Get_count(&status, MpiType, &count);
335
336 recvMap[fromRank].resize(count);
337 MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);
338
339 totalGIDs += count;
340 }
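    // Note (added): MPI_Probe followed by MPI_Get_count is used because a receiver does not know in
    // advance how many GIDs each sender ships; the buffer recvMap[fromRank] is sized from the probed
    // status before the matching MPI_Recv is posted.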
341
342 // Do waits on send requests
343 if (numSend) {
344 Array<MPI_Status> sendStatuses(numSend);
345 MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
346 }
347
348 // Merge GIDs
349 myGIDs.reserve(totalGIDs);
350 for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
351 int offset = myGIDs.size(), len = it->second.size();
352 if (len) {
353 myGIDs.resize(offset + len);
354 memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len*sizeof(GO));
355 }
356 }
357    // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that the original myGIDs and all received chunks
358 // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
359 std::sort(myGIDs.begin(), myGIDs.end());
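    // Sketch (added, hypothetical): one way to exploit the sortedness mentioned in NOTE 2 would be to
    // replace the memcpy/sort pair above by merging each received chunk into the sorted prefix, e.g.
    //   std::copy(it->second.begin(), it->second.end(), myGIDs.begin() + offset);
    //   std::inplace_merge(myGIDs.begin(), myGIDs.begin() + offset, myGIDs.end());
    // A true k-way merge would avoid the repeated passes entirely.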
360
361 // Step 3: Construct importer
362 RCP<Map> newRowMap = MapFactory ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
363 RCP<const Import> rowMapImporter;
364
365 RCP<const BlockedMap> blockedRowMap = Teuchos::rcp_dynamic_cast<const BlockedMap>(rowMap);
366
367 {
368 SubFactoryMonitor m1(*this, "Import construction", currentLevel);
369 // Generate a nonblocked rowmap if we need one
370 if(blockedRowMap.is_null())
371 rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
372 else {
373 rowMapImporter = ImportFactory::Build(blockedRowMap->getMap(), newRowMap);
374 }
375 }
376
377 // If we're running BlockedCrs we should chop up the newRowMap into a newBlockedRowMap here (and do likewise for importers)
378 if(!blockedRowMap.is_null()) {
379 SubFactoryMonitor m1(*this, "Blocking newRowMap and Importer", currentLevel);
380 RCP<const BlockedMap> blockedTargetMap = MueLu::UtilitiesBase<Scalar,LocalOrdinal,GlobalOrdinal,Node>::GeneratedBlockedTargetMap(*blockedRowMap,*rowMapImporter);
381
382    // NOTE: This code qualifies as "correct but not particularly performant". If this needs to be sped up, we can probably read data from the existing importer to
383 // build sub-importers rather than generating new ones ex nihilo
384 size_t numBlocks = blockedRowMap->getNumMaps();
385 std::vector<RCP<const Import> > subImports(numBlocks);
386
387 for(size_t i=0; i<numBlocks; i++) {
388 RCP<const Map> source = blockedRowMap->getMap(i);
389 RCP<const Map> target = blockedTargetMap->getMap(i);
390 subImports[i] = ImportFactory::Build(source,target);
391 }
392 Set(currentLevel,"SubImporters",subImports);
393 }
394
395
396 Set(currentLevel, "Importer", rowMapImporter);
397
398 // ======================================================================================================
399 // Print some data
400 // ======================================================================================================
401 if (!rowMapImporter.is_null() && IsPrint(Statistics2)) {
402 // int oldRank = SetProcRankVerbose(rebalancedAc->getRowMap()->getComm()->getRank());
403 GetOStream(Statistics2) << PerfUtils::PrintImporterInfo(rowMapImporter, "Importer for rebalancing");
404 // SetProcRankVerbose(oldRank);
405 }
406 if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
407 // Print the grid of processors
408 GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;
409
410 char amActive = (myGIDs.size() ? 1 : 0);
411 std::vector<char> areActive(numProcs, 0);
412 MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);
413
414 int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
415 for (int proc = 0; proc < numProcs; proc += rowWidth) {
416 for (int j = 0; j < rowWidth; j++)
417 if (proc + j < numProcs)
418 GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
419 else
420 GetOStream(Statistics2) << " ";
421
422 GetOStream(Statistics2) << " " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
423 }
424 }
425
426 } // Build
427
428 //----------------------------------------------------------------------
429 template<typename T, typename W>
430 struct Triplet {
431 T i, j;
432 W v;
433 };
434 template<typename T, typename W>
435 static bool compareTriplets(const Triplet<T,W>& a, const Triplet<T,W>& b) {
436 return (a.v > b.v); // descending order
437 }
438
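  // Illustrative sketch (added; not part of MueLu): how the Triplet / compareTriplets helpers above
  // are intended to be used. The function name exampleSortEdges and the literal values are hypothetical.
  inline Teuchos::Array<Triplet<int,int> > exampleSortEdges() {
    Teuchos::Array<Triplet<int,int> > edges(3);
    edges[0].i = 0; edges[0].j = 1; edges[0].v = 90;  // rank 0 holds 90 nonzeros belonging to part 1
    edges[1].i = 1; edges[1].j = 0; edges[1].v = 70;  // rank 1 holds 70 nonzeros belonging to part 0
    edges[2].i = 1; edges[2].j = 1; edges[2].v = 80;  // rank 1 holds 80 nonzeros belonging to part 1
    // Descending sort by weight: afterwards the order of weights is 90, 80, 70.
    std::sort(edges.begin(), edges.end(), compareTriplets<int,int>);
    return edges;
  }
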
439 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
440  void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
441  DeterminePartitionPlacement(const Matrix& A, GOVector& decomposition, GO numPartitions, bool willAcceptPartition, bool allSubdomainsAcceptPartitions) const {
442 RCP<const Map> rowMap = A.getRowMap();
443
444 RCP<const Teuchos::Comm<int> > comm = rowMap->getComm()->duplicate();
445 int numProcs = comm->getSize();
446
447 RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
448 TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
449 RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
450
451 const Teuchos::ParameterList& pL = GetParameterList();
452
453    // maxLocal is a constant which determines the number of largest edges that are exchanged.
454    // The idea is that we do not want to construct the full bipartite graph, but only a subset of
455    // it, which requires less communication. By selecting the largest local edges we hope to achieve
456 // similar results but at a lower cost.
457 const int maxLocal = pL.get<int>("repartition: remap num values");
458 const int dataSize = 2*maxLocal;
459
460 ArrayRCP<GO> decompEntries;
461 if (decomposition.getLocalLength() > 0)
462 decompEntries = decomposition.getDataNonConst(0);
463
464 // Step 1: Sort local edges by weight
465 // Each edge of a bipartite graph corresponds to a triplet (i, j, v) where
466 // i: processor id that has some piece of part with part_id = j
467 // j: part id
468 // v: weight of the edge
469 // We set edge weights to be the total number of nonzeros in rows on this processor which
470    // correspond to this part_id. The idea is that when we redistribute the matrix, this weight
471 // is a good approximation of the amount of data to move.
472    // We use two maps: the original one, which maps the partition id of an edge to the corresponding weight,
473    // and a reverse one, which is needed to sort edges by weight.
474 std::map<GO,GO> lEdges;
475 if (willAcceptPartition)
476 for (LO i = 0; i < decompEntries.size(); i++)
477 lEdges[decompEntries[i]] += A.getNumEntriesInLocalRow(i);
478
479 // Reverse map, so that edges are sorted by weight.
480    // This results in a multimap, as we may have edges with the same weight
481 std::multimap<GO,GO> revlEdges;
482 for (typename std::map<GO,GO>::const_iterator it = lEdges.begin(); it != lEdges.end(); it++)
483 revlEdges.insert(std::make_pair(it->second, it->first));
484
485 // Both lData and gData are arrays of data which we communicate. The data is stored
486 // in pairs, so that data[2*i+0] is the part index, and data[2*i+1] is the corresponding edge weight.
487    // We do not store the processor id in the data, as we can compute it from the offset in gData.
488 Array<GO> lData(dataSize, -1), gData(numProcs * dataSize);
489 int numEdges = 0;
490 for (typename std::multimap<GO,GO>::reverse_iterator rit = revlEdges.rbegin(); rit != revlEdges.rend() && numEdges < maxLocal; rit++) {
491 lData[2*numEdges+0] = rit->second; // part id
492 lData[2*numEdges+1] = rit->first; // edge weight
493 numEdges++;
494 }
495
496    // Step 2: Gather the largest local edges
497    // Each processor contributes maxLocal edges by providing maxLocal pairs <part id, weight>, i.e., dataSize values
498 MPI_Datatype MpiType = Teuchos::Details::MpiTypeTraits<GO>::getType();
499 MPI_Allgather(static_cast<void*>(lData.getRawPtr()), dataSize, MpiType, static_cast<void*>(gData.getRawPtr()), dataSize, MpiType, *rawMpiComm);
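    // Worked example (added for illustration): with maxLocal = 2 (dataSize = 4) and two ranks, gData
    // could look like [1,90, 0,40,  1,80, 0,70], where the first dataSize entries were contributed by
    // rank 0 and the next dataSize entries by rank 1; unused slots keep the initial value -1 and are
    // skipped below.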
500
501 // Step 3: Construct mapping
502
503 // Construct the set of triplets
504 Teuchos::Array<Triplet<int,int> > gEdges(numProcs * maxLocal);
505 Teuchos::Array<bool> procWillAcceptPartition(numProcs, allSubdomainsAcceptPartitions);
506 size_t k = 0;
507 for (LO i = 0; i < gData.size(); i += 2) {
508 int procNo = i/dataSize; // determine the processor by its offset (since every processor sends the same amount)
509 GO part = gData[i+0];
510 GO weight = gData[i+1];
511 if (part != -1) { // skip nonexistent edges
512 gEdges[k].i = procNo;
513 gEdges[k].j = part;
514 gEdges[k].v = weight;
515 procWillAcceptPartition[procNo] = true;
516 k++;
517 }
518 }
519 gEdges.resize(k);
520
521 // Sort edges by weight
522    // NOTE: compareTriplets is actually a reverse sort, so the edge weights are in decreasing order
523 std::sort(gEdges.begin(), gEdges.end(), compareTriplets<int,int>);
524
525 // Do matching
526 std::map<int,int> match;
527 Teuchos::Array<char> matchedRanks(numProcs, 0), matchedParts(numPartitions, 0);
528 int numMatched = 0;
529 for (typename Teuchos::Array<Triplet<int,int> >::const_iterator it = gEdges.begin(); it != gEdges.end(); it++) {
530 GO rank = it->i;
531 GO part = it->j;
532 if (matchedRanks[rank] == 0 && matchedParts[part] == 0) {
533 matchedRanks[rank] = 1;
534 matchedParts[part] = 1;
535 match[part] = rank;
536 numMatched++;
537 }
538 }
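    // Worked example (added for illustration): if the sorted edges are (rank 0, part 1, w=90),
    // (rank 1, part 1, w=80), (rank 1, part 0, w=70), the greedy pass assigns part 1 -> rank 0 and
    // part 0 -> rank 1; the middle edge is skipped because part 1 is already matched.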
539 GetOStream(Statistics1) << "Number of unassigned partitions before cleanup stage: " << (numPartitions - numMatched) << " / " << numPartitions << std::endl;
540
541 // Step 4: Assign unassigned partitions if necessary.
542 // We do that through desperate matching for remaining partitions:
543 // We select the lowest rank that can still take a partition.
544 // The reason it is done this way is that we don't need any extra communication, as we don't
545 // need to know which parts are valid.
546 if (numPartitions - numMatched > 0) {
547 Teuchos::Array<char> partitionCounts(numPartitions, 0);
548 for (typename std::map<int,int>::const_iterator it = match.begin(); it != match.end(); it++)
549 partitionCounts[it->first] += 1;
550 for (int part = 0, matcher = 0; part < numPartitions; part++) {
551 if (partitionCounts[part] == 0) {
552 // Find first non-matched rank that accepts partitions
553 while (matchedRanks[matcher] || !procWillAcceptPartition[matcher])
554 matcher++;
555
556 match[part] = matcher++;
557 numMatched++;
558 }
559 }
560 }
561
562 TEUCHOS_TEST_FOR_EXCEPTION(numMatched != numPartitions, Exceptions::RuntimeError, "MueLu::RepartitionFactory::DeterminePartitionPlacement: Only " << numMatched << " partitions out of " << numPartitions << " got assigned to ranks.");
563
564 // Step 5: Permute entries in the decomposition vector
565 for (LO i = 0; i < decompEntries.size(); i++)
566 decompEntries[i] = match[decompEntries[i]];
567 }
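
  // --- Illustrative sketch (added; not part of MueLu) --------------------------------------------
  // What a consumer of the "Importer" produced by Build() typically does with it: allocate data on
  // the rebalanced (target) map and migrate it. The function name exampleMigrateVector is
  // hypothetical; only Xpetra facilities whose headers are already included above are used.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  Teuchos::RCP<Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
  exampleMigrateVector(const Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& X,
                       const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node>& importer) {
    // Build a multivector on the importer's target map (the new row distribution) ...
    Teuchos::RCP<Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > newX =
        Xpetra::MultiVectorFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(importer.getTargetMap(), X.getNumVectors());
    // ... and pull the data over according to the communication plan encoded in the importer.
    newX->doImport(X, importer, Xpetra::INSERT);
    return newX;
  }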
568
569} // namespace MueLu
570
571#endif //ifdef HAVE_MPI
572
573#endif // MUELU_REPARTITIONFACTORY_DEF_HPP