1 #ifndef MADNESS_SYSTOLIC_H
2 #define MADNESS_SYSTOLIC_H
46 class SystolicMatrixAlgorithm :
public TaskInterface {
48 DistributedMatrix<T>& A;
55 std::vector<T*> iptr, jptr;
56 std::vector<int64_t> map;
58 void iteration(
const TaskThreadEnv& env) {
66 A.local_colrange(ilo, ihi);
68 int neven = coldim + (coldim&0x1);
70 int pairlo = rank*A.coltile()/2;
72 int threadid = env.id();
73 int nthread = env.nthread();
75 for (
int loop=0; loop<(neven-1); ++loop) {
78 for (
int pair=env.id(); pair<nlocal; pair+=nthread) {
80 int rp = neven/2-1-(pair+pairlo);
81 int iii = (rp+loop)%(neven-1);
82 int jjj = (2*neven-2-rp+loop)%(neven-1);
83 if (rp == 0) jjj = neven-1;
89 kernel(iii, jjj, iptr[pair], jptr[pair]);
94 if (threadid == 0) cycle();
114 if (nlocal <= 0)
return;
115 Tensor<T>& t = A.data();
116 Tensor<T> tmp(2L, t.dims(),
false);
118 for (int64_t i=0; i<nlocal; ++i) {
119 memcpy(tp+i*rowdim, iptr[i], rowdim*
sizeof(
T));
121 memcpy(tp+(i+nlocal)*rowdim, jptr[i], rowdim*
sizeof(
T));
124 jptr[i] = &t(i+nlocal,0);
126 memcpy(t.ptr(), tmp.ptr(), t.size()*
sizeof(
T));
128 if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
133 if (coldim <= 2)
return;
135 MADNESS_ASSERT(rank >= nproc);
140 MADNESS_ASSERT(A.local_coldim() == A.coltile() || rank == (nproc-1));
205 T* ilast = iptr[nlocal-1];
209 for (int64_t i=0; i<nlocal-1; ++i) {
210 iptr[nlocal-i-1] = iptr[nlocal-i-2];
214 World& world = A.get_world();
218 jptr[nlocal-2] = ilast;
220 else if (rank == 0) {
222 world.mpi.Send(ilast, rowdim, right, tag);
223 jptr[nlocal-1] = ilast;
224 world.mpi.Recv(ilast, rowdim, right, tag);
226 else if (rank == (nproc-1)) {
229 jptr[nlocal-2] = ilast;
231 std::vector<T> buf(rowdim);
233 world.mpi.Send(iptr[0], rowdim, left, tag);
234 world.await(req,
false);
235 std::memcpy(iptr[0], &buf[0], rowdim*
sizeof(
T));
238 std::vector<T> buf1(rowdim);
239 std::vector<T> buf2(rowdim);
242 world.mpi.Send( ilast, rowdim, right, tag);
243 world.mpi.Send(jfirst, rowdim, left, tag);
244 world.await(req1,
false);
245 world.await(req2,
false);
246 std::memcpy(ilast, &buf2[0], rowdim*
sizeof(
T));
247 std::memcpy(jfirst, &buf1[0], rowdim*
sizeof(
T));
250 jptr[nlocal-1] = ilast;
257 virtual void get_id(std::pair<void*,unsigned short>&
id)
const {
270 , nproc(A.process_coldim()*A.process_rowdim())
273 , nlocal((A.local_coldim()+1)/2)
278 , map(coldim+(coldim&0x1))
285 Tensor<T>& t = A.
data();
289 for (int64_t i=0; i<nlocal; ++i) {
291 jptr[i] = &t(i+nlocal,0);
295 if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
299 int neven = (coldim+1)/2;
304 int p_nlocal = (hi - lo + 2)/2;
306 for (
int i=0; i<p_nlocal; ++i) {
309 map[ii+i+neven] = lo+i+p_nlocal;
314 std::reverse(map.begin(),map.begin()+neven);
/// Threadsafe routine to apply the operation to rows i and j of the matrix.
///
/// Pure virtual: derived classes supply the implementation. Within this
/// class it is invoked from iteration() once per local pair, with the row
/// indices computed for that pair and the pointers iptr[pair] / jptr[pair].
///
/// @param i    Index of the first row of the pair.
/// @param j    Index of the second row of the pair.
/// @param rowi Pointer to the data of row i (taken from iptr by the caller).
/// @param rowj Pointer to the data of row j (taken from jptr by the caller).
327 virtual void kernel(
int i,
int j,
T* rowi,
T* rowj) = 0;
359 if (env.
id() == 0) unshuffle();
int id() const
Definition: worldthread.h:309
virtual void start_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the start of each iteration.
Definition: apps/ii/systolic.h:593
virtual void end_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the end of each iteration.
Definition: apps/ii/systolic.h:599
int64_t get_rowdim() const
Returns length of row.
Definition: madness/tensor/systolic.h:373
void get_colrange(int p, int64_t &ilow, int64_t &ihigh) const
Returns the inclusive range of column indices on processor p.
Definition: apps/ii/systolic.h:137
Used to pass info about thread environment into user's task.
Definition: worldthread.h:289
bool barrier() const
Definition: worldthread.h:311
World & get_world() const
Returns associated world.
Definition: apps/ii/systolic.h:152
Tensor< T > & data()
Returns reference to data.
Definition: apps/ii/systolic.h:155
void solve_sequential()
Invoked by the user to run the algorithm with one thread, mostly for debugging.
Definition: madness/tensor/systolic.h:368
World & get_world() const
Returns a reference to the world.
Definition: apps/ii/systolic.h:631
void run(World &world, const TaskThreadEnv &env)
Invoked by the task queue to run the algorithm with multiple threads.
Definition: madness/tensor/systolic.h:354
This header should include pretty much everything needed for the parallel runtime.
virtual ~SystolicMatrixAlgorithm()
Definition: madness/tensor/systolic.h:319
Defines and implements most of Tensor.
const T1 &f1 return GTEST_2_TUPLE_() T(f0, f1)
static std::size_t size()
Returns number of threads in the pool.
Definition: worldthread.h:1040
ProcessID get_rank() const
Returns rank of this process in the world.
Definition: madness/tensor/systolic.h:385
A parallel world with full functionality wrapping an MPI communicator.
Definition: worldfwd.h:416
int ProcessID
Used to clearly identify process number/rank.
Definition: worldtypes.h:37
virtual void kernel(int i, int j, T *rowi, T *rowj)=0
Threadsafe routine to apply the operation to rows i and j of the matrix.
Definition: safempi.h:243
int64_t get_coldim() const
Returns length of column.
Definition: madness/tensor/systolic.h:377
static enable_if_c< detail::function_traits< fnT >::value||detail::memfunc_traits< fnT >::value >::type make_id(std::pair< void *, unsigned short > &id, fnT fn)
Definition: worldthread.h:680
bool is_column_distributed() const
Returns true if the matrix is column distributed (i.e., row dimension not distributed).
Definition: apps/ii/systolic.h:161
int64_t coltile() const
Returns the column tile size.
Definition: apps/ii/systolic.h:86
void set_nthread(int nthread)
Call this to reset the number of threads before the task is submitted.
Definition: worldthread.h:769
SystolicMatrixAlgorithm(DistributedMatrix< T > &A, int tag, int nthread=ThreadPool::size()+1)
A must be a column distributed matrix with an even column tile >= 2.
Definition: madness/tensor/systolic.h:268
Holds machinery to set up Functions/FuncImpls using various Factories and Interfaces.
Definition: chem/atomutil.cc:45
virtual bool converged(const TaskThreadEnv &env) const =0
Invoked simultaneously by all threads after each sweep to test for convergence.
Manages data associated with a row/column/block distributed array.
Definition: apps/ii/systolic.h:51