33 #ifndef MADNESS_SYSTOLIC_H
34 #define MADNESS_SYSTOLIC_H
61 , Pcoldim((n-1)/tilen+1)
62 , Prowdim((m-1)/tilem+1)
64 , Prow(rank - Pcol*Prowdim)
66 , ihi(
std::
min(ilo+tilen-1,n-1))
68 , jhi(
std::
min(jlo+tilem-1,m-1))
69 , idim(
std::
max(ihi-ilo+1,int64_t(0)))
70 , jdim(
std::
max(jhi-jlo+1,int64_t(0)))
74 if (idim>0 && jdim>0) t = Tensor<T>(idim,jdim);
139 int pj = p - pi*Prowdim;
158 const Tensor<T>&
data()
const {
return t;}
184 const int64_t Pcoldim;
185 const int64_t Prowdim;
188 const int64_t ilo,ihi;
189 const int64_t jlo,jhi;
190 const int64_t idim,jdim;
200 template <
typename T>
206 if (world.
size()*coltile < n) coltile = (n-1)/world.
size() + 1;
207 if ((coltile&0x1)) coltile++;
215 template <
typename T>
217 if (world.
size()*rowtile <
m) rowtile = (n-1)/world.
size() + 1;
232 template <
typename T>
250 template <
typename T>
265 template <
typename T>
287 template <
typename T>
292 int64_t mt = ma + b.
coldim();
303 template <
typename T>
307 MADNESS_ASSERT( n == m );
313 for(int64_t i=ilo; i<=ihi; ++i) {
314 result.data()(i-ilo, i) = 1;
321 template <
typename T>
331 std::vector<T*> iptr, jptr;
332 std::vector<int64_t> map;
342 int neven = coldim + (coldim&0x1);
344 int pairlo = rank*A.
coltile()/2;
346 int threadid = env.
id();
349 for (
int loop=0; loop<(neven-1); loop++) {
352 for (
int pair=env.
id(); pair<nlocal; pair+=nthread) {
354 int rp = neven/2-1-(pair+pairlo);
355 int iii = (rp+loop)%(neven-1);
356 int jjj = (2*neven-2-rp+loop)%(neven-1);
357 if (rp == 0) jjj = neven-1;
363 kernel(iii, jjj, iptr[pair], jptr[pair]);
368 if (threadid == 0) cycle();
386 if (nlocal <= 0)
return;
387 Tensor<T>& t = A.
data();
389 Tensor<T> tmp(2
L, t.dims(),
false);
391 for (int64_t i=0; i<nlocal; i++) {
392 memcpy(tp+i*rowdim, iptr[i], rowdim*
sizeof(
T));
394 memcpy(tp+(i+nlocal)*rowdim, jptr[i], rowdim*
sizeof(
T));
397 jptr[i] = &t(i+nlocal,0);
399 memcpy(t.ptr(), tmp.ptr(), t.size()*
sizeof(
T));
401 if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
406 if (coldim <= 2)
return;
408 MADNESS_ASSERT(rank >= nproc);
478 T* ilast = iptr[nlocal-1];
482 for (int64_t i=0; i<nlocal-1; i++) {
483 iptr[nlocal-i-1] = iptr[nlocal-i-2];
492 jptr[nlocal-2] = ilast;
494 else if (rank == 0) {
496 world.
mpi.
Send(ilast, rowdim, right, tag);
497 jptr[nlocal-1] = ilast;
498 world.
mpi.
Recv(ilast, rowdim, right, tag);
500 else if (rank == (nproc-1)) {
503 jptr[nlocal-2] = ilast;
505 world.
mpi.
Send(iptr[0], rowdim, left, tag);
506 world.
mpi.
Recv(iptr[0], rowdim, left, tag);
509 world.
mpi.
Send( ilast, rowdim, right, tag);
510 world.
mpi.
Send(jfirst, rowdim, left, tag);
511 world.
mpi.
Recv( ilast, rowdim, right, tag);
512 world.
mpi.
Recv(jfirst, rowdim, left, tag);
514 jptr[nlocal-1] = ilast;
521 virtual void get_id(std::pair<void*,unsigned short>&
id)
const {
529 , nproc(A.process_coldim()*A.process_rowdim())
532 , nlocal((A.local_coldim()+1)/2)
537 , map(coldim+(coldim&0x1))
544 Tensor<T>& t = A.
data();
548 for (int64_t i=0; i<nlocal; i++) {
550 jptr[i] = &t(i+nlocal,0);
554 if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
558 int neven = (coldim+1)/2;
563 int p_nlocal = (hi - lo + 2)/2;
565 for (
int i=0; i<p_nlocal; i++) {
568 map[ii+i+neven] = lo+i+p_nlocal;
573 std::reverse(map.begin(),map.begin()+neven);
581 virtual void kernel(
int i,
int j,
T* rowi,
T* rowj) = 0;
612 if (env.
id() == 0) unshuffle();
638 template <
typename T>
643 const double thresh = 1e-6,
const double thetamax = 0.5,
const bool randomized =
true,
const bool doprint =
false):
664 void kernel(
int i,
int j,
T* rowi,
T* rowj);
671 std::vector<int> set;
672 long nmo, niter, ndone, ndone_iter;
673 volatile int64_t nrot;
674 const double thetamax, thresh;
676 volatile double maxtheta;
677 const bool randomized, doprint;
680 inline T DIP(
const T ij[],
const T kl[])
const;
681 inline T inner(
const T a[],
const T b[] )
const;
683 template <
typename T>
690 M.local_colrange(ilo, ihi);
691 for(int64_t i=0; i <=(ihi-ilo); i++){
692 T ii[] = { M.data()(i,i+ilo+nmo), M.data()(i,i+ilo+2*nmo), M.data()(i,i+ilo+3*nmo) };
696 if (env.
id() == 0) world.gop.sum(sum);
700 printf(
"\titeration %ld sum=%.4f ndone=%ld tol=%.2e, maxtheta=%.2e\n", niter, sum, ndone, tol, maxtheta);
705 this->maxtheta = 0.0;
709 template <
typename T>
712 if(set[i] != set[j])
return;
717 T *ri[] = { rowi + nmo, rowi + 2*nmo, rowi + 3*nmo };
718 T *rj[] = { rowj + nmo, rowj + 2*nmo, rowj + 3*nmo };
724 double g = DIP(ij, jj) - DIP(ij, ii);
725 double h = 4.0*DIP(ij, ij) + 2.0*DIP(ii, jj) - DIP(ii, ii) - DIP(jj, jj);
728 if (doprint)
print(
"\t\tforcing negative h", i, j, h);
732 double theta = -g / h;
734 this->maxtheta = std::max<double>(
fabs(theta), (double)maxtheta);
737 if (
fabs(theta) > thetamax){
738 if (doprint)
print(
"\t\trestricting", i,j);
740 if (g < 0) theta = -thetamax;
741 else theta = thetamax * 0.8;
749 if (
fabs(theta) >= tol || randomized || doit){
750 if (doprint)
print(
"\t\trotating", i,j, theta);
753 double c =
cos(theta);
754 double s =
sin(theta);
755 drot(ri[0], rj[0], s, c);
756 drot(ri[1], rj[1], s, c);
757 drot(ri[2], rj[2], s, c);
761 template <
typename T>
766 this->tol =
std::max(0.1 * maxtheta, thresh);
767 this->ndone_iter = nrot;
770 world.gop.sum(ndone_iter);
771 this->ndone += ndone_iter;
775 template <
typename T>
778 if( ndone_iter == 0 && tol <= thresh){
779 if (doprint)
madness::print(
"\tBoys localization converged in", ndone,
"steps.");
782 else if(niter >= 300){
783 if(doprint)
madness::print(
"\tWARNING!! Boys localization did not fully converged in", niter,
"iteration!\n");
790 template <
typename T>
793 for (
long i=0; i<this->nmo; i++ ) {
794 T aa = a[i]*cos - b[i]*
sin;
795 T bb = a[i]*sin + b[i]*
cos;
800 template <
typename T>
801 inline T LocalizeBoys<T>::DIP(
const T ij[],
const T kl[])
const
803 return ij[0] * kl[0] + ij[1] * kl[1] + ij[2] * kl[2];
805 template <
typename T>
809 for(int64_t i=0; i<nmo; i++){
int id() const
Definition: worldthread.h:309
NDIM const Function< R, NDIM > & g
Definition: mra.h:2179
void solve()
Invoked by the user to run the algorithm with one thread.
Definition: apps/ii/systolic.h:621
int64_t local_size() const
Returns the total no. of elements stored on this processor.
Definition: apps/ii/systolic.h:98
int get_coldim() const
Returns length of column.
Definition: apps/ii/systolic.h:628
virtual void start_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the start of each iteration.
Definition: apps/ii/systolic.h:593
void Recv(T *buf, long lenbuf, int src, int tag) const
Receive data of up to lenbuf elements from process dest.
Definition: worldmpi.h:313
WorldMpiInterface & mpi
MPI interface.
Definition: worldfwd.h:459
virtual void end_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the end of each iteration.
Definition: apps/ii/systolic.h:599
virtual ~DistributedMatrix()
Definition: apps/ii/systolic.h:77
void get_colrange(int p, int64_t &ilow, int64_t &ihigh) const
Returns the inclusive range of column indices on processor p.
Definition: apps/ii/systolic.h:137
DistributedMatrix(World &world, int64_t n, int64_t m, int64_t coltile, int64_t rowtile)
Definition: apps/ii/systolic.h:53
const double L
Definition: 3dharmonic.cc:123
int64_t rowdim() const
Returns the row dimension of the matrix ... i.e., m for A(n,m)
Definition: apps/ii/systolic.h:83
Used to pass info about thread environment into users task.
Definition: worldthread.h:289
void kernel(int i, int j, T *rowi, T *rowj)
Threadsafe routine to apply the operation to rows i and j of the matrix.
Definition: apps/ii/systolic.h:710
bool barrier() const
Definition: worldthread.h:311
const mpreal cos(const mpreal &v, mp_rnd_t rnd_mode)
Definition: mpreal.h:2255
void copyout(Tensor< T > &s) const
Given the full matrix s(n,m), copy out the data range local to this processor.
Definition: apps/ii/systolic.h:172
World & get_world() const
Returns associated world.
Definition: apps/ii/systolic.h:152
const Tensor< T > & data() const
Returns const reference to data.
Definition: apps/ii/systolic.h:158
int get_rowdim() const
Returns length of row.
Definition: apps/ii/systolic.h:625
Definition: mpreal.h:3066
DistributedMatrix< T > column_distributed_matrix(World &world, int64_t n, int64_t m, int64_t coltile=0)
Generates an (n,m) matrix distributed by columns (row dimension is not distributed) ...
Definition: apps/ii/systolic.h:201
Tensor< T > & data()
Returns reference to data.
Definition: apps/ii/systolic.h:155
ProcessID size() const
Returns the number of processes in this world (same as MPI_Comm_size())
Definition: worldfwd.h:533
int64_t coldim() const
Returns the column dimension of the matrix ... i.e., n for A(n,m)
Definition: apps/ii/systolic.h:80
World & get_world() const
Returns a reference to the world.
Definition: apps/ii/systolic.h:631
void run(World &world, const TaskThreadEnv &env)
Invoked by the task queue to run the algorithm with multiple threads.
Definition: apps/ii/systolic.h:603
int64_t process_rowdim() const
Returns the no. of processors in the row dimension.
Definition: apps/ii/systolic.h:95
This header should include pretty much everything needed for the parallel runtime.
virtual ~SystolicMatrixAlgorithm()
Definition: apps/ii/systolic.h:578
T inner(const vecfunc< T, NDIM > &a, const vecfunc< T, NDIM > &b)
the non-linear solver requires an inner product
Definition: nemo.h:112
bool is_row_distributed() const
Returns true if the matrix is row distributed (i.e., column dimension not distributed) ...
Definition: apps/ii/systolic.h:164
void end_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the end of each iteration.
Definition: apps/ii/systolic.h:762
Defines and implements most of Tensor.
void start_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the start of each iteration.
Definition: apps/ii/systolic.h:684
Base class for parallel algorithms that employ a systolic loop to generate all row pairs in parallel...
Definition: apps/ii/systolic.h:322
const T1 &f1 return GTEST_2_TUPLE_() T(f0, f1)
#define max(a, b)
Definition: lda.h:53
const double pi
Definition: navstokes_cosines.cc:91
const mpreal min(const mpreal &x, const mpreal &y)
Definition: mpreal.h:2675
virtual ~LocalizeBoys()
Definition: apps/ii/systolic.h:661
FLOAT a(int j, FLOAT z)
Definition: y1.cc:86
Definition: apps/ii/systolic.h:639
void drot(long n, double *restrict a, double *restrict b, double s, double c, long inc)
Simple (?) version of BLAS-1 DROT(N, DX, INCX, DY, INCY, DC, DS)
Definition: chem/distpm.cc:18
void copyin(const Tensor< T > &s)
Given the full matrix(n,m), copy in the data range local to this processor.
Definition: apps/ii/systolic.h:167
static std::size_t size()
Returns number of threads in the pool.
Definition: worldthread.h:1040
int nthread() const
Definition: worldthread.h:307
ProcessID get_rank() const
Returns rank of this process in the world.
Definition: apps/ii/systolic.h:634
A parallel world with full functionality wrapping an MPI communicator.
Definition: worldfwd.h:416
void local_colrange(int64_t &ilow, int64_t &ihigh) const
Returns the inclusive range of column indices on this processor.
Definition: apps/ii/systolic.h:109
int ProcessID
Used to clearly identify process number/rank.
Definition: worldtypes.h:37
DistributedMatrix< T > idMatrix(const DistributedMatrix< T > &A)
make identity matrix with same propaties of A
Definition: apps/ii/systolic.h:304
const mpreal sum(const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode)
Definition: mpreal.cc:241
const mpreal sin(const mpreal &v, mp_rnd_t rnd_mode)
Definition: mpreal.h:2262
const double m
Definition: gfit.cc:199
virtual void kernel(int i, int j, T *rowi, T *rowj)=0
Threadsafe routine to apply the operation to rows i and j of the matrix.
DistributedMatrix< T > concatenate_rows(const DistributedMatrix< T > &a, const DistributedMatrix< T > &b)
Generates a distributed matrix with rows of a and b concatenated.
Definition: apps/ii/systolic.h:251
int64_t process_coldim() const
Returns the no. of processors in the column dimension.
Definition: apps/ii/systolic.h:92
static enable_if_c< detail::function_traits< fnT >::value||detail::memfunc_traits< fnT >::value >::type make_id(std::pair< void *, unsigned short > &id, fnT fn)
Definition: worldthread.h:680
DistributedMatrix< T > concatenate_columns(const DistributedMatrix< T > &a, const DistributedMatrix< T > &b)
Generates a row-distributed matrix with rows of a and b contatenated.
Definition: apps/ii/systolic.h:288
DistributedMatrix< T > row_distributed_matrix(World &world, int64_t n, int64_t m, int64_t rowtile=0)
Generates an (n,m) matrix distributed by rows (column dimension is not distributed) ...
Definition: apps/ii/systolic.h:216
bool is_column_distributed() const
Returns true if the matrix is column distributed (i.e., row dimension not distributed) ...
Definition: apps/ii/systolic.h:161
int64_t coltile() const
Returns the column tile size.
Definition: apps/ii/systolic.h:86
void doit(World &world)
Definition: tdse1.cc:753
void print(const A &a)
Print a single item to std::cout terminating with new line.
Definition: print.h:122
void set_nthread(int nthread)
Call this to reset the number of threads before the task is submitted.
Definition: worldthread.h:769
A slice defines a sub-range or patch of a dimension.
Definition: slice.h:103
SystolicMatrixAlgorithm(DistributedMatrix< T > &A, int tag, int nthread=ThreadPool::size()+1)
A must be a column distributed matrix with an even column tile >= 2.
Definition: apps/ii/systolic.h:527
bool converged(const TaskThreadEnv &env) const
Invoked simultaneously by all threads after each sweep to test for convergence.
Definition: apps/ii/systolic.h:776
All world tasks must be derived from this public interface.
Definition: taskfn.h:68
int64_t local_rowdim() const
Returns the no. of row elements stored on this processor.
Definition: apps/ii/systolic.h:104
#define restrict
Definition: config.h:403
const mpreal fabs(const mpreal &v, mp_rnd_t rnd_mode)
Definition: mpreal.h:2187
Holds machinery to set up Functions/FuncImpls using various Factories and Interfaces.
Definition: chem/atomutil.cc:45
void local_rowrange(int64_t &jlow, int64_t &jhigh) const
Returns the inclusive range of row indices on this processor.
Definition: apps/ii/systolic.h:123
const double c
Definition: gfit.cc:200
FLOAT b(int j, FLOAT z)
Definition: y1.cc:79
DistributedMatrix< T > interleave_rows(const DistributedMatrix< T > &a, const DistributedMatrix< T > &b)
Generates a distributed matrix with rows of a and b interleaved.
Definition: apps/ii/systolic.h:233
void Send(const T *buf, long lenbuf, int dest, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const
Send array of lenbuf elements to process dest.
Definition: worldmpi.h:296
virtual bool converged(const TaskThreadEnv &env) const =0
Invoked simultaneously by all threads after each sweep to test for convergence.
int64_t local_coldim() const
Returns the no. of column elements stored on this processor.
Definition: apps/ii/systolic.h:101
int64_t rowtile() const
Returns the row tile size.
Definition: apps/ii/systolic.h:89
Manages data associated with a row/column/block distributed array.
Definition: apps/ii/systolic.h:51