VirtualFluids 0.2.0
Parallel CFD LBM Solver
Loading...
Searching...
No Matches
ExchangeData27.cpp
Go to the documentation of this file.
1//=======================================================================================
2// ____ ____ __ ______ __________ __ __ __ __
3// \ \ | | | | | _ \ |___ ___| | | | | / \ | |
4// \ \ | | | | | |_) | | | | | | | / \ | |
5// \ \ | | | | | _ / | | | | | | / /\ \ | |
6// \ \ | | | | | | \ \ | | | \__/ | / ____ \ | |____
7// \ \ | | |__| |__| \__\ |__| \________/ /__/ \__\ |_______|
8// \ \ | | ________________________________________________________________
9// \ \ | | | ______________________________________________________________|
10// \ \| | | | __ __ __ __ ______ _______
11// \ | | |_____ | | | | | | | | | _ \ / _____)
12// \ | | _____| | | | | | | | | | | \ \ \_______
13// \ | | | | |_____ | \_/ | | | | |_/ / _____ |
14// \ _____| |__| |________| \_______/ |__| |______/ (_______/
15//
16// This file is part of VirtualFluids. VirtualFluids is free software: you can
17// redistribute it and/or modify it under the terms of the GNU General Public
18// License as published by the Free Software Foundation, either version 3 of
19// the License, or (at your option) any later version.
20//
21// VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
22// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
23// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
24// for more details.
25//
26// SPDX-License-Identifier: GPL-3.0-or-later
27// SPDX-FileCopyrightText: Copyright © VirtualFluids Project contributors, see AUTHORS.md in root folder
28//
33//======================================================================================
34
35#include "ExchangeData27.h"
36
37#include <cuda_runtime.h>
38#include <helper_cuda.h>
39
41#include <vector>
42
46#include "Parameter/Parameter.h"
48
49using namespace vf::lbm::dir;
50
51namespace vf::gpu {
52
54// 3D domain decomposition
56
57void collectNodesInSendBufferGPU(Parameter* para, int level, CudaStreamIndex streamIndex,
58 const std::vector<ProcessNeighbor27>& sendProcessNeighborsDevice)
59{
60 cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
61
62 for (const auto& neighbor : sendProcessNeighborsDevice) {
63 GetSendFsPostDev27(para->getParD(level)->distributions.f[0],
64 neighbor.populations[0],
65 para->getParD(level)->distributionsAD.f[0],
66 neighbor.populationsAD[0],
67 neighbor.index,
68 neighbor.numberOfNodes,
69 para->getParD(level)->neighborX,
70 para->getParD(level)->neighborY,
71 para->getParD(level)->neighborZ,
72 para->getParD(level)->numberOfNodes,
73 para->getParD(level)->isEvenTimestep,
74 para->getDiffOn(),
75 para->getParD(level)->numberofthreads,
76 stream);
77 }
78}
79
80void scatterNodesFromRecvBufferGPU(Parameter* para, int level, CudaStreamIndex streamIndex,
81 const std::vector<ProcessNeighbor27>& recvProcessNeighborsDevice)
82{
83 cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
84 for (const auto& neighbor : recvProcessNeighborsDevice) {
85 SetRecvFsPostDev27(para->getParD(level)->distributions.f[0],
86 neighbor.populations[0],
87 para->getParD(level)->distributionsAD.f[0],
88 neighbor.populationsAD[0],
89 neighbor.index,
90 neighbor.numberOfNodes,
91 para->getParD(level)->neighborX,
92 para->getParD(level)->neighborY,
93 para->getParD(level)->neighborZ,
94 para->getParD(level)->numberOfNodes,
95 para->getParD(level)->isEvenTimestep,
96 para->getDiffOn(),
97 para->getParD(level)->numberofthreads,
98 stream);
99 }
100}
101
102void startNonBlockingMpiSend(vf::parallel::Communicator& comm, const std::vector<ProcessNeighbor27>& sendProcessNeighborsHost,
103 const bool diffOn)
104{
105 for (const auto& neighbor : sendProcessNeighborsHost) {
106 comm.sendNonBlocking(neighbor.populations[0], neighbor.numberOfFs, neighbor.rankNeighbor);
107 if (diffOn)
108 comm.sendNonBlocking(neighbor.populationsAD[0], neighbor.numberOfFs, neighbor.rankNeighbor);
109 }
110}
111
113 const bool diffOn)
114{
115 for (const auto& neighbor : recvProcessNeighborsHost) {
116 comm.receiveNonBlocking(neighbor.populations[0], neighbor.numberOfFs, neighbor.rankNeighbor);
117 if (diffOn)
118 comm.receiveNonBlocking(neighbor.populationsAD[0], neighbor.numberOfFs, neighbor.rankNeighbor);
119 }
120}
121
122void copyEdgeNodes(const std::vector<LBMSimulationParameter::EdgeNodePositions>& edgeNodes,
123 const std::vector<ProcessNeighbor27>& recvProcessNeighborsHost,
124 const std::vector<ProcessNeighbor27>& sendProcessNeighborsHost, const bool diffOn)
125{
126#pragma omp parallel for
127 for (int i = 0; i < int(edgeNodes.size()); i++) {
128 const auto& edgeNode = edgeNodes[i];
129 const auto& sendNeighbor = sendProcessNeighborsHost[edgeNode.indexOfProcessNeighborSend];
130 const auto& recvNeighbor = recvProcessNeighborsHost[edgeNode.indexOfProcessNeighborRecv];
131
132 if (edgeNode.indexInSendBuffer >= sendNeighbor.numberOfNodes)
133 // for reduced communication after fine to coarse: only copy send nodes which are not part of the reduced comm
134 continue;
135
137 getDistributionReferences27(sendNeighbor.populations[0], sendNeighbor.numberOfNodes, true);
139 getDistributionReferences27(recvNeighbor.populations[0], recvNeighbor.numberOfNodes, true);
140 forEachDirection([&](auto direction) { (populationsSend.f[direction])[edgeNode.indexInSendBuffer] = (populationsRecv.f[direction])[edgeNode.indexInRecvBuffer]; });
141
142 if (diffOn) {
144 getDistributionReferences27(sendNeighbor.populationsAD[0], sendNeighbor.numberOfNodes, true);
146 getDistributionReferences27(recvNeighbor.populationsAD[0], recvNeighbor.numberOfNodes, true);
147
148 forEachDirection([&](auto direction) { (populationsADSend.f[direction])[edgeNode.indexInSendBuffer] = (populationsADRecv.f[direction])[edgeNode.indexInRecvBuffer]; });
149 }
150 }
151}
152
154 const CudaStreamIndex streamIndex,
155 const std::vector<ProcessNeighbor27>& sendProcessNeighborsDevice,
156 const std::vector<ProcessNeighbor27>& recvProcessNeighborsDevice,
157 const std::vector<ProcessNeighbor27>& sendProcessNeighborsHost,
158 const std::vector<ProcessNeighbor27>& recvProcessNeighborsHost,
159 const std::optional<std::vector<ProcessNeighbor27>>& recvProcessNeighborsHostX,
160 const std::optional<std::vector<LBMSimulationParameter::EdgeNodePositions>>& edgeNodesX,
161 const std::optional<std::vector<ProcessNeighbor27>>& recvProcessNeighborsHostY,
162 const std::optional<std::vector<LBMSimulationParameter::EdgeNodePositions>>& edgeNodesY)
163{
164 cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
165 const size_t numberOfProcessNeighbors = sendProcessNeighborsHost.size();
169 for (size_t i = 0; i < numberOfProcessNeighbors; i++)
176 if (para->getUseStreams())
187 comm.waitAll();
190 if (0 < numberOfProcessNeighbors)
191 comm.resetRequests();
194 for (size_t i = 0; i < numberOfProcessNeighbors; i++)
197}
198
200// X
203{
204 auto& parD = para->getParDeviceAsReference(level);
205 collectNodesInSendBufferGPU(para, level, streamIndex, parD.sendProcessNeighborsX);
206}
207
209{
210 auto& parD = para->getParDeviceAsReference(level);
211 collectNodesInSendBufferGPU(para, level, streamIndex, parD.sendProcessNeighborsAfterFtoCX);
212}
213
215 int level, CudaStreamIndex streamIndex)
216{
217 auto& parD = para->getParDeviceAsReference(level);
218 auto& parH = para->getParHostAsReference(level);
219
220 exchangeCollDataGPU27(para, comm, cudaMemoryManager, streamIndex,
221 parD.sendProcessNeighborsX, parD.recvProcessNeighborsX,
222 parH.sendProcessNeighborsX, parH.recvProcessNeighborsX);
223}
224
226 int level, CudaStreamIndex streamIndex)
227{
228 auto& parD = para->getParDeviceAsReference(level);
229 auto& parH = para->getParHostAsReference(level);
230
231 exchangeCollDataGPU27(para, comm, cudaMemoryManager, streamIndex,
232 parD.sendProcessNeighborsAfterFtoCX, parD.recvProcessNeighborsAfterFtoCX,
233 parH.sendProcessNeighborsAfterFtoCX, parH.recvProcessNeighborsAfterFtoCX);
234}
235
237{
238 auto& parD = para->getParDeviceAsReference(level);
239 scatterNodesFromRecvBufferGPU(para, level, streamIndex, parD.recvProcessNeighborsX);
240}
241
243{
244 auto& parD = para->getParDeviceAsReference(level);
245 scatterNodesFromRecvBufferGPU(para, level, streamIndex, parD.recvProcessNeighborsAfterFtoCX);
246}
247
249// Y
252{
253 auto& parD = para->getParDeviceAsReference(level);
254 collectNodesInSendBufferGPU(para, level, streamIndex, parD.sendProcessNeighborsY);
255}
256
258{
259 auto& parD = para->getParDeviceAsReference(level);
260 collectNodesInSendBufferGPU(para, level, streamIndex, parD.sendProcessNeighborsAfterFtoCY);
261}
262
264 int level, CudaStreamIndex streamIndex)
265{
266 auto& parD = para->getParDeviceAsReference(level);
267 auto& parH = para->getParHostAsReference(level);
268
269 exchangeCollDataGPU27(para, comm, cudaMemoryManager, streamIndex,
270 parD.sendProcessNeighborsY, parD.recvProcessNeighborsY,
271 parH.sendProcessNeighborsY, parH.recvProcessNeighborsY,
272 parH.recvProcessNeighborsX, parH.edgeNodesXtoY);
273}
274
275
277 int level, CudaStreamIndex streamIndex)
278{
279 auto& parD = para->getParDeviceAsReference(level);
280 auto& parH = para->getParHostAsReference(level);
281
282 exchangeCollDataGPU27(para, comm, cudaMemoryManager, streamIndex,
283 parD.sendProcessNeighborsAfterFtoCY, parD.recvProcessNeighborsAfterFtoCY,
284 parH.sendProcessNeighborsAfterFtoCY, parH.recvProcessNeighborsAfterFtoCY,
285 parH.recvProcessNeighborsAfterFtoCX, parH.edgeNodesXtoY);
286}
287
289{
290 auto& parD = para->getParDeviceAsReference(level);
291 scatterNodesFromRecvBufferGPU(para, level, streamIndex, parD.recvProcessNeighborsY);
292}
293
295{
296 auto& parD = para->getParDeviceAsReference(level);
297 scatterNodesFromRecvBufferGPU(para, level, streamIndex, parD.recvProcessNeighborsAfterFtoCY);
298}
299
301// Z
304{
305 auto& parD = para->getParDeviceAsReference(level);
306 collectNodesInSendBufferGPU(para, level, streamIndex, parD.sendProcessNeighborsZ);
307}
308
310{
311 auto& parD = para->getParDeviceAsReference(level);
312 collectNodesInSendBufferGPU(para, level, streamIndex, parD.sendProcessNeighborsAfterFtoCZ);
313}
314
316 int level, CudaStreamIndex streamIndex)
317{
318 auto& parD = para->getParDeviceAsReference(level);
319 auto& parH = para->getParHostAsReference(level);
320
321 exchangeCollDataGPU27(para, comm, cudaMemoryManager, streamIndex,
322 parD.sendProcessNeighborsZ, parD.recvProcessNeighborsZ,
323 parH.sendProcessNeighborsZ, parH.recvProcessNeighborsZ,
324 parH.recvProcessNeighborsX, parH.edgeNodesXtoZ,
325 parH.recvProcessNeighborsY, parH.edgeNodesYtoZ);
326}
328 int level, CudaStreamIndex streamIndex)
329{
330 auto& parD = para->getParDeviceAsReference(level);
331 auto& parH = para->getParHostAsReference(level);
332
333 exchangeCollDataGPU27(para, comm, cudaMemoryManager, streamIndex,
334 parD.sendProcessNeighborsAfterFtoCZ, parD.recvProcessNeighborsAfterFtoCZ,
335 parH.sendProcessNeighborsAfterFtoCZ, parH.recvProcessNeighborsAfterFtoCZ,
336 parH.recvProcessNeighborsAfterFtoCX, parH.edgeNodesXtoZ,
337 parH.recvProcessNeighborsAfterFtoCY, parH.edgeNodesYtoZ);
338}
339
341{
342 auto& parD = para->getParDeviceAsReference(level);
343 scatterNodesFromRecvBufferGPU(para, level, streamIndex, parD.recvProcessNeighborsZ);
344}
345
347{
348 auto& parD = para->getParDeviceAsReference(level);
349 scatterNodesFromRecvBufferGPU(para, level, streamIndex, parD.recvProcessNeighborsAfterFtoCZ);
350}
351
352}
353
void cudaCopyProcessNeighborFsHtoD(const ProcessNeighbor27 &neighborHost, const ProcessNeighbor27 &neighborDevice) const
void cudaCopyProcessNeighborFsDtoH(const ProcessNeighbor27 &neighborHost, const ProcessNeighbor27 &neighborDevice) const
Class for LBM-parameter management.
Definition Parameter.h:359
std::unique_ptr< CudaStreamManager > & getStreamManager()
std::shared_ptr< LBMSimulationParameter > getParD(int level)
Pointer to instance of LBMSimulationParameter - stored on Device (GPU)
LBMSimulationParameter & getParHostAsReference(int level) const
LBMSimulationParameter & getParDeviceAsReference(int level) const
An abstract class for communication between processes in parallel computation.
virtual void resetRequests()=0
virtual void sendNonBlocking(real *sbuf, int count_s, int destinationRank)=0
virtual void receiveNonBlocking(real *rbuf, int count_r, int sourceRank)=0
virtual void waitAll()=0
std::shared_ptr< T > SPtr
void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
void startNonBlockingMpiReceive(vf::parallel::Communicator &comm, const std::vector< ProcessNeighbor27 > &recvProcessNeighborsHost, const bool diffOn)
void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex)
void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
Distribute the receive nodes (x direction) from the buffer on the gpu.
void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex)
void copyEdgeNodes(const std::vector< LBMSimulationParameter::EdgeNodePositions > &edgeNodes, const std::vector< ProcessNeighbor27 > &recvProcessNeighborsHost, const std::vector< ProcessNeighbor27 > &sendProcessNeighborsHost, const bool diffOn)
Copy nodes which are part of the communication in multiple directions.
void exchangeCollDataGPU27(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, const CudaStreamIndex streamIndex, const std::vector< ProcessNeighbor27 > &sendProcessNeighborsDevice, const std::vector< ProcessNeighbor27 > &recvProcessNeighborsDevice, const std::vector< ProcessNeighbor27 > &sendProcessNeighborsHost, const std::vector< ProcessNeighbor27 > &recvProcessNeighborsHost, const std::optional< std::vector< ProcessNeighbor27 > > &recvProcessNeighborsHostX, const std::optional< std::vector< LBMSimulationParameter::EdgeNodePositions > > &edgeNodesX, const std::optional< std::vector< ProcessNeighbor27 > > &recvProcessNeighborsHostY, const std::optional< std::vector< LBMSimulationParameter::EdgeNodePositions > > &edgeNodesY)
Exchange routine for simulations on multiple gpus.
void scatterNodesFromRecvBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex, const std::vector< ProcessNeighbor27 > &recvProcessNeighborsDevice)
Distribute the receive nodes from the buffer on the gpu.
void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
void startNonBlockingMpiSend(vf::parallel::Communicator &comm, const std::vector< ProcessNeighbor27 > &sendProcessNeighborsHost, const bool diffOn)
void collectNodesInSendBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex, const std::vector< ProcessNeighbor27 > &sendProcessNeighborsDevice)
Routines for data exchange when running simulations on multiple GPUs.
void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
constexpr DistributionReferences27 getDistributionReferences27(real *distributions, const unsigned long long numberOfLBnodes, const bool isEvenTimestep)
void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex)
void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex)
Calls exchangeCollDataGPU27() for exchanging all nodes.
void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex)
void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::parallel::Communicator &comm, const CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex)
Calls exchangeCollDataGPU27() for exchanging the nodes, which are part of the communication between t...
void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
Collect the send nodes for communication in the x direction in a buffer on the gpu.
void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex)
void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
Distribute the receive nodes (x direction) from the buffer on the gpu.
void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex)
Collect the send nodes for communication in the x direction in a buffer on the gpu.
constexpr void forEachDirection(F func)
Definition D3Q27.h:363