33 #ifndef MADNESS_SYSTOLIC_H 
   34 #define MADNESS_SYSTOLIC_H 
   61             , Pcoldim((n-1)/tilen+1)
 
   62             , Prowdim((m-1)/tilem+1)
 
   64             , Prow(rank - Pcol*Prowdim)
 
   66             , ihi(
std::
min(ilo+tilen-1,n-1))
 
   68             , jhi(
std::
min(jlo+tilem-1,m-1))
 
   69             , idim(
std::
max(ihi-ilo+1,int64_t(0)))
 
   70             , jdim(
std::
max(jhi-jlo+1,int64_t(0)))
 
   74             if (idim>0 && jdim>0) t = Tensor<T>(idim,jdim);
 
  139             int pj = p - pi*Prowdim;
 
  158         const Tensor<T>& 
data()
 const {
return t;}
 
  184         const int64_t Pcoldim;          
 
  185         const int64_t Prowdim;          
 
  188         const int64_t ilo,ihi;          
 
  189         const int64_t jlo,jhi;          
 
  190         const int64_t idim,jdim;        
 
  200     template <
typename T>
 
  206         if (world.
size()*coltile < n) coltile = (n-1)/world.
size() + 1;
 
  207         if ((coltile&0x1)) coltile++;
 
  215     template <
typename T>
 
  217         if (world.
size()*rowtile < 
m) rowtile = (n-1)/world.
size() + 1;
 
  232     template <
typename T>
 
  250     template <
typename T>
 
  265     template <
typename T>
 
  287     template <
typename T>
 
  292         int64_t mt = ma + b.
coldim();
 
  303     template <
typename T>
 
  307             MADNESS_ASSERT( n == m );
 
  313             for(int64_t i=ilo; i<=ihi; ++i) {
 
  314                     result.data()(i-ilo, i) = 1;
 
  321     template <
typename T>
 
  331         std::vector<T*> iptr, jptr;     
 
  332         std::vector<int64_t> map;       
 
  342             int neven = coldim + (coldim&0x1);
 
  344             int pairlo = rank*A.
coltile()/2;
 
  346             int threadid = env.
id();
 
  349             for (
int loop=0; loop<(neven-1); loop++) {
 
  352                 for (
int pair=env.
id(); pair<nlocal; pair+=nthread) {
 
  354                     int rp = neven/2-1-(pair+pairlo);
 
  355                     int iii = (rp+loop)%(neven-1);
 
  356                     int jjj = (2*neven-2-rp+loop)%(neven-1);
 
  357                     if (rp == 0) jjj = neven-1;
 
  363                         kernel(iii, jjj, iptr[pair], jptr[pair]);
 
  368                 if (threadid == 0) cycle();
 
  386             if (nlocal <= 0) 
return;
 
  387             Tensor<T>& t = A.
data();
 
  389             Tensor<T> tmp(2
L, t.dims(), 
false);
 
  391             for (int64_t i=0; i<nlocal; i++) {
 
  392                 memcpy(tp+i*rowdim, iptr[i], rowdim*
sizeof(
T));
 
  394                     memcpy(tp+(i+nlocal)*rowdim, jptr[i], rowdim*
sizeof(
T));
 
  397                 jptr[i] = &t(i+nlocal,0);
 
  399             memcpy(t.ptr(), tmp.ptr(), t.size()*
sizeof(
T));
 
  401             if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
 
  406             if (coldim <= 2) 
return; 
 
  408                 MADNESS_ASSERT(rank >= nproc);
 
  478             T* ilast  = iptr[nlocal-1];
 
  482             for (int64_t i=0; i<nlocal-1; i++) {
 
  483                 iptr[nlocal-i-1] = iptr[nlocal-i-2];
 
  492                 jptr[nlocal-2] = ilast;
 
  494             else if (rank == 0) {
 
  496                 world.
mpi.
Send(ilast, rowdim, right, tag);
 
  497                 jptr[nlocal-1] = ilast;
 
  498                 world.
mpi.
Recv(ilast, rowdim, right, tag);
 
  500             else if (rank == (nproc-1)) {
 
  503                     jptr[nlocal-2] = ilast;
 
  505                 world.
mpi.
Send(iptr[0], rowdim, left, tag);
 
  506                 world.
mpi.
Recv(iptr[0], rowdim, left, tag);
 
  509                 world.
mpi.
Send( ilast, rowdim, right, tag);
 
  510                 world.
mpi.
Send(jfirst, rowdim,  left, tag);
 
  511                 world.
mpi.
Recv( ilast, rowdim, right, tag);
 
  512                 world.
mpi.
Recv(jfirst, rowdim,  left, tag);
 
  514                 jptr[nlocal-1] = ilast;
 
  521         virtual void get_id(std::pair<void*,unsigned short>& 
id)
 const {
 
  529             , nproc(A.process_coldim()*A.process_rowdim())
 
  532             , nlocal((A.local_coldim()+1)/2)
 
  537             , map(coldim+(coldim&0x1))
 
  544             Tensor<T>& t = A.
data();
 
  548             for (int64_t i=0; i<nlocal; i++) {
 
  550                 jptr[i] = &t(i+nlocal,0);
 
  554             if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
 
  558             int neven = (coldim+1)/2;
 
  563                 int p_nlocal = (hi - lo + 2)/2;
 
  565                 for (
int i=0; i<p_nlocal; i++) {
 
  568                     map[ii+i+neven] = lo+i+p_nlocal;
 
  573             std::reverse(map.begin(),map.begin()+neven);
 
  581         virtual void kernel(
int i, 
int j, 
T* rowi, 
T* rowj) = 0;
 
  612             if (env.
id() == 0) unshuffle();
 
  638     template <
typename T>
 
  643                 const double thresh = 1e-6, 
const double thetamax = 0.5, 
const bool randomized = 
true, 
const bool doprint = 
false):
 
  664         void kernel(
int i, 
int j, 
T* rowi, 
T* rowj);
 
  671         std::vector<int> set;
 
  672         long nmo, niter, ndone, ndone_iter; 
 
  673         volatile int64_t nrot; 
 
  674         const double thetamax, thresh;
 
  676         volatile double maxtheta; 
 
  677         const bool randomized, doprint;
 
  680         inline T DIP(
const T ij[], 
const T kl[]) 
const;
 
  681         inline T inner(
const T a[], 
const T b[] ) 
const;
 
  683     template <
typename T>
 
  690             M.local_colrange(ilo, ihi);
 
  691             for(int64_t i=0; i <=(ihi-ilo); i++){
 
  692                 T ii[] = { M.data()(i,i+ilo+nmo), M.data()(i,i+ilo+2*nmo), M.data()(i,i+ilo+3*nmo) };
 
  696             if (env.
id() == 0) world.gop.sum(sum);
 
  700             printf(
"\titeration %ld sum=%.4f ndone=%ld tol=%.2e, maxtheta=%.2e\n", niter, sum, ndone, tol, maxtheta);
 
  705             this->maxtheta = 0.0; 
 
  709     template <
typename T>
 
  712         if(set[i] != set[j]) 
return;
 
  717         T *ri[] = { rowi + nmo, rowi + 2*nmo, rowi + 3*nmo };
 
  718         T *rj[] = { rowj + nmo, rowj + 2*nmo, rowj + 3*nmo };
 
  724         double g = DIP(ij, jj) - DIP(ij, ii);
 
  725         double h = 4.0*DIP(ij, ij) + 2.0*DIP(ii, jj) - DIP(ii, ii) - DIP(jj, jj);
 
  728             if (doprint) 
print(
"\t\tforcing negative h", i, j, h);
 
  732         double theta = -g / h;
 
  734         this->maxtheta = std::max<double>(
fabs(theta), (double)maxtheta); 
 
  737         if (
fabs(theta) > thetamax){
 
  738             if (doprint) 
print(
"\t\trestricting", i,j);
 
  740             if (g < 0) theta = -thetamax;
 
  741             else theta = thetamax * 0.8;
 
  749         if (
fabs(theta) >= tol || randomized || doit){
 
  750             if (doprint) 
print(
"\t\trotating", i,j, theta);
 
  753             double c = 
cos(theta);
 
  754             double s = 
sin(theta);
 
  755             drot(ri[0], rj[0], s, c);
 
  756             drot(ri[1], rj[1], s, c);
 
  757             drot(ri[2], rj[2], s, c);
 
  761     template <
typename T>
 
  766             this->tol = 
std::max(0.1 * maxtheta, thresh);
 
  767             this->ndone_iter = nrot;
 
  770             world.gop.sum(ndone_iter); 
 
  771             this->ndone += ndone_iter;
 
  775     template <
typename T>
 
  778         if( ndone_iter == 0 && tol <= thresh){
 
  779             if (doprint) 
madness::print(
"\tBoys localization converged in", ndone, 
"steps.");
 
  782         else if(niter >= 300){
 
  783             if(doprint) 
madness::print(
"\tWARNING!! Boys localization did not fully converged in", niter, 
"iteration!\n");
 
  790     template <
typename T>
 
  793         for ( 
long i=0; i<this->nmo; i++ ) {
 
  794             T aa = a[i]*cos - b[i]*
sin;
 
  795             T bb = a[i]*sin + b[i]*
cos;
 
  800     template <
typename T>
 
  801     inline T LocalizeBoys<T>::DIP(
const T ij[], 
const T kl[])
 const 
  803         return ij[0] * kl[0] + ij[1] * kl[1] + ij[2] * kl[2];
 
  805     template <
typename T>
 
  809         for(int64_t i=0; i<nmo; i++){
 
int id() const 
Definition: worldthread.h:309
NDIM const Function< R, NDIM > & g
Definition: mra.h:2179
void solve()
Invoked by the user to run the algorithm with one thread. 
Definition: apps/ii/systolic.h:621
int64_t local_size() const 
Returns the total no. of elements stored on this processor. 
Definition: apps/ii/systolic.h:98
int get_coldim() const 
Returns length of column. 
Definition: apps/ii/systolic.h:628
virtual void start_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the start of each iteration. 
Definition: apps/ii/systolic.h:593
void Recv(T *buf, long lenbuf, int src, int tag) const 
Receive data of up to lenbuf elements from process dest. 
Definition: worldmpi.h:313
WorldMpiInterface & mpi
MPI interface. 
Definition: worldfwd.h:459
virtual void end_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the end of each iteration. 
Definition: apps/ii/systolic.h:599
virtual ~DistributedMatrix()
Definition: apps/ii/systolic.h:77
void get_colrange(int p, int64_t &ilow, int64_t &ihigh) const 
Returns the inclusive range of column indices on processor p. 
Definition: apps/ii/systolic.h:137
DistributedMatrix(World &world, int64_t n, int64_t m, int64_t coltile, int64_t rowtile)
Definition: apps/ii/systolic.h:53
const double L
Definition: 3dharmonic.cc:123
int64_t rowdim() const 
Returns the row dimension of the matrix ... i.e., m for A(n,m) 
Definition: apps/ii/systolic.h:83
Used to pass info about thread environment into users task. 
Definition: worldthread.h:289
void kernel(int i, int j, T *rowi, T *rowj)
Threadsafe routine to apply the operation to rows i and j of the matrix. 
Definition: apps/ii/systolic.h:710
bool barrier() const 
Definition: worldthread.h:311
const mpreal cos(const mpreal &v, mp_rnd_t rnd_mode)
Definition: mpreal.h:2255
void copyout(Tensor< T > &s) const 
Given the full matrix s(n,m), copy out the data range local to this processor. 
Definition: apps/ii/systolic.h:172
World & get_world() const 
Returns associated world. 
Definition: apps/ii/systolic.h:152
const Tensor< T > & data() const 
Returns const reference to data. 
Definition: apps/ii/systolic.h:158
int get_rowdim() const 
Returns length of row. 
Definition: apps/ii/systolic.h:625
Definition: mpreal.h:3066
DistributedMatrix< T > column_distributed_matrix(World &world, int64_t n, int64_t m, int64_t coltile=0)
Generates an (n,m) matrix distributed by columns (row dimension is not distributed) ...
Definition: apps/ii/systolic.h:201
Tensor< T > & data()
Returns reference to data. 
Definition: apps/ii/systolic.h:155
ProcessID size() const 
Returns the number of processes in this world (same as MPI_Comm_size()) 
Definition: worldfwd.h:533
int64_t coldim() const 
Returns the column dimension of the matrix ... i.e., n for A(n,m) 
Definition: apps/ii/systolic.h:80
World & get_world() const 
Returns a reference to the world. 
Definition: apps/ii/systolic.h:631
void run(World &world, const TaskThreadEnv &env)
Invoked by the task queue to run the algorithm with multiple threads. 
Definition: apps/ii/systolic.h:603
int64_t process_rowdim() const 
Returns the no. of processors in the row dimension. 
Definition: apps/ii/systolic.h:95
This header should include pretty much everything needed for the parallel runtime. 
virtual ~SystolicMatrixAlgorithm()
Definition: apps/ii/systolic.h:578
T inner(const vecfunc< T, NDIM > &a, const vecfunc< T, NDIM > &b)
the non-linear solver requires an inner product 
Definition: nemo.h:112
bool is_row_distributed() const 
Returns true if the matrix is row distributed (i.e., column dimension not distributed) ...
Definition: apps/ii/systolic.h:164
void end_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the end of each iteration. 
Definition: apps/ii/systolic.h:762
Defines and implements most of Tensor. 
void start_iteration_hook(const TaskThreadEnv &env)
Invoked by all threads at the start of each iteration. 
Definition: apps/ii/systolic.h:684
Base class for parallel algorithms that employ a systolic loop to generate all row pairs in parallel...
Definition: apps/ii/systolic.h:322
const T1 &f1 return GTEST_2_TUPLE_() T(f0, f1)
#define max(a, b)
Definition: lda.h:53
const double pi
Definition: navstokes_cosines.cc:91
const mpreal min(const mpreal &x, const mpreal &y)
Definition: mpreal.h:2675
virtual ~LocalizeBoys()
Definition: apps/ii/systolic.h:661
FLOAT a(int j, FLOAT z)
Definition: y1.cc:86
Definition: apps/ii/systolic.h:639
void drot(long n, double *restrict a, double *restrict b, double s, double c, long inc)
Simple (?) version of BLAS-1 DROT(N, DX, INCX, DY, INCY, DC, DS) 
Definition: chem/distpm.cc:18
void copyin(const Tensor< T > &s)
Given the full matrix(n,m), copy in the data range local to this processor. 
Definition: apps/ii/systolic.h:167
static std::size_t size()
Returns number of threads in the pool. 
Definition: worldthread.h:1040
int nthread() const 
Definition: worldthread.h:307
ProcessID get_rank() const 
Returns rank of this process in the world. 
Definition: apps/ii/systolic.h:634
A parallel world with full functionality wrapping an MPI communicator. 
Definition: worldfwd.h:416
void local_colrange(int64_t &ilow, int64_t &ihigh) const 
Returns the inclusive range of column indices on this processor. 
Definition: apps/ii/systolic.h:109
int ProcessID
Used to clearly identify process number/rank. 
Definition: worldtypes.h:37
DistributedMatrix< T > idMatrix(const DistributedMatrix< T > &A)
make identity matrix with same propaties of A 
Definition: apps/ii/systolic.h:304
const mpreal sum(const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode)
Definition: mpreal.cc:241
const mpreal sin(const mpreal &v, mp_rnd_t rnd_mode)
Definition: mpreal.h:2262
const double m
Definition: gfit.cc:199
virtual void kernel(int i, int j, T *rowi, T *rowj)=0
Threadsafe routine to apply the operation to rows i and j of the matrix. 
DistributedMatrix< T > concatenate_rows(const DistributedMatrix< T > &a, const DistributedMatrix< T > &b)
Generates a distributed matrix with rows of a and b concatenated. 
Definition: apps/ii/systolic.h:251
int64_t process_coldim() const 
Returns the no. of processors in the column dimension. 
Definition: apps/ii/systolic.h:92
static enable_if_c< detail::function_traits< fnT >::value||detail::memfunc_traits< fnT >::value >::type make_id(std::pair< void *, unsigned short > &id, fnT fn)
Definition: worldthread.h:680
DistributedMatrix< T > concatenate_columns(const DistributedMatrix< T > &a, const DistributedMatrix< T > &b)
Generates a row-distributed matrix with rows of a and b contatenated. 
Definition: apps/ii/systolic.h:288
DistributedMatrix< T > row_distributed_matrix(World &world, int64_t n, int64_t m, int64_t rowtile=0)
Generates an (n,m) matrix distributed by rows (column dimension is not distributed) ...
Definition: apps/ii/systolic.h:216
bool is_column_distributed() const 
Returns true if the matrix is column distributed (i.e., row dimension not distributed) ...
Definition: apps/ii/systolic.h:161
int64_t coltile() const 
Returns the column tile size. 
Definition: apps/ii/systolic.h:86
void doit(World &world)
Definition: tdse1.cc:753
void print(const A &a)
Print a single item to std::cout terminating with new line. 
Definition: print.h:122
void set_nthread(int nthread)
Call this to reset the number of threads before the task is submitted. 
Definition: worldthread.h:769
A slice defines a sub-range or patch of a dimension. 
Definition: slice.h:103
SystolicMatrixAlgorithm(DistributedMatrix< T > &A, int tag, int nthread=ThreadPool::size()+1)
A must be a column distributed matrix with an even column tile >= 2. 
Definition: apps/ii/systolic.h:527
bool converged(const TaskThreadEnv &env) const 
Invoked simultaneously by all threads after each sweep to test for convergence. 
Definition: apps/ii/systolic.h:776
All world tasks must be derived from this public interface. 
Definition: taskfn.h:68
int64_t local_rowdim() const 
Returns the no. of row elements stored on this processor. 
Definition: apps/ii/systolic.h:104
#define restrict
Definition: config.h:403
const mpreal fabs(const mpreal &v, mp_rnd_t rnd_mode)
Definition: mpreal.h:2187
Holds machinery to set up Functions/FuncImpls using various Factories and Interfaces. 
Definition: chem/atomutil.cc:45
void local_rowrange(int64_t &jlow, int64_t &jhigh) const 
Returns the inclusive range of row indices on this processor. 
Definition: apps/ii/systolic.h:123
const double c
Definition: gfit.cc:200
FLOAT b(int j, FLOAT z)
Definition: y1.cc:79
DistributedMatrix< T > interleave_rows(const DistributedMatrix< T > &a, const DistributedMatrix< T > &b)
Generates a distributed matrix with rows of a and b interleaved. 
Definition: apps/ii/systolic.h:233
void Send(const T *buf, long lenbuf, int dest, int tag=SafeMPI::DEFAULT_SEND_RECV_TAG) const 
Send array of lenbuf elements to process dest. 
Definition: worldmpi.h:296
virtual bool converged(const TaskThreadEnv &env) const =0
Invoked simultaneously by all threads after each sweep to test for convergence. 
int64_t local_coldim() const 
Returns the no. of column elements stored on this processor. 
Definition: apps/ii/systolic.h:101
int64_t rowtile() const 
Returns the row tile size. 
Definition: apps/ii/systolic.h:89
Manages data associated with a row/column/block distributed array. 
Definition: apps/ii/systolic.h:51