35 #ifndef MADNESS_TENSOR_MXM_H__INCLUDED
36 #define MADNESS_TENSOR_MXM_H__INCLUDED
57 template <
typename T,
typename Q,
typename S>
58 static inline void mxm(
long dimi,
long dimj,
long dimk,
68 for (
long i=0; i<dimi; ++i) {
69 for (
long k=0;
k<dimk; ++
k) {
71 #pragma _CRI prefervector
73 for (
long j=0; j<dimj; ++j) {
74 c[i*dimj+j] += a[i*dimk+
k]*b[
k*dimj+j];
82 template <
typename T,
typename Q,
typename S>
84 void mTxm(
long dimi,
long dimj,
long dimk,
96 for (
long k=0;
k<dimk; ++
k) {
97 for (
long j=0; j<dimj; ++j) {
99 #pragma _CRI prefervector
101 for (
long i=0; i<dimi; ++i) {
102 c[i*dimj+j] += a[
k*dimi+i]*b[
k*dimj+j];
109 template <
typename T,
typename Q,
typename S>
110 static inline void mxmT(
long dimi,
long dimj,
long dimk,
122 for (
long i=0; i<dimi; ++i) {
123 for (
long j=0; j<dimj; ++j) {
125 for (
long k=0;
k<dimk; ++
k) {
126 sum += a[i*dimk+
k]*b[j*dimk+
k];
134 template <
typename T,
typename Q,
typename S>
135 static inline void mTxmT(
long dimi,
long dimj,
long dimk,
145 for (
long i=0; i<dimi; ++i) {
147 #pragma _CRI prefervector
149 for (
long j=0; j<dimj; ++j) {
150 for (
long k=0;
k<dimk; ++
k) {
151 c[i*dimj+j] += a[
k*dimi+i]*b[j*dimk+
k];
160 void mTxm(
long dimi,
long dimj,
long dimk,
double*
restrict c,
168 inline void mTxm(
long dimi,
long dimj,
long dimk,
183 long dimk4 = (dimk/4)*4;
184 for (
long i=0; i<dimi; ++i,c+=dimj) {
185 const double* ai = a+i;
187 for (
long k=0;
k<dimk4;
k+=4,ai+=4*dimi,p+=4*dimj) {
188 double ak0i = ai[0 ];
189 double ak1i = ai[dimi];
190 double ak2i = ai[dimi+dimi];
191 double ak3i = ai[dimi+dimi+dimi];
192 const double* bk0 = p;
193 const double* bk1 = p+dimj;
194 const double* bk2 = p+dimj+dimj;
195 const double* bk3 = p+dimj+dimj+dimj;
196 for (
long j=0; j<dimj; ++j) {
197 c[j] += ak0i*bk0[j] + ak1i*bk1[j] + ak2i*bk2[j] + ak3i*bk3[j];
200 for (
long k=dimk4;
k<dimk; ++
k) {
201 double aki = a[
k*dimi+i];
202 const double* bk = b+
k*dimj;
203 for (
long j=0; j<dimj; ++j) {
215 inline void mxmT(
long dimi,
long dimj,
long dimk,
230 long dimi2 = (dimi/2)*2;
231 for (
long i=0; i<dimi2; i+=2) {
232 const double* ai0 = a+i*dimk;
233 const double* ai1 = a+i*dimk+dimk;
235 double*
restrict ci1 = c+i*dimj+dimj;
236 for (
long j=0; j<dimj; ++j) {
239 const double* bj = b + j*dimk;
240 for (
long k=0;
k<dimk; ++
k) {
241 sum0 += ai0[
k]*bj[
k];
242 sum1 += ai1[
k]*bj[
k];
248 for (
long i=dimi2; i<dimi; ++i) {
249 const double* ai = a+i*dimk;
251 for (
long j=0; j<dimj; ++j) {
253 const double* bj = b+j*dimk;
254 for (
long k=0;
k<dimk; ++
k) {
264 inline void mxm(
long dimi,
long dimj,
long dimk,
276 long dimk4 = (dimk/4)*4;
277 for (
long i=0; i<dimi; ++i, c+=dimj,a+=dimk) {
279 for (
long k=0;
k<dimk4;
k+=4,p+=4*dimj) {
281 double aik1 = a[
k+1];
282 double aik2 = a[
k+2];
283 double aik3 = a[
k+3];
284 const double* bk0 = p;
285 const double* bk1 = bk0+dimj;
286 const double* bk2 = bk1+dimj;
287 const double* bk3 = bk2+dimj;
288 for (
long j=0; j<dimj; ++j) {
289 c[j] += aik0*bk0[j] + aik1*bk1[j] + aik2*bk2[j] + aik3*bk3[j];
292 for (
long k=dimk4;
k<dimk; ++
k) {
294 for (
long j=0; j<dimj; ++j) {
295 c[j] += aik*b[
k*dimj+j];
303 inline void mTxmT(
long dimi,
long dimj,
long dimk,
316 long dimj2 = (dimj/2)*2;
318 for (
long klo=0; klo<dimk; klo+=ktile, asave+=ktile*dimi, b+=ktile) {
319 long khi = klo+ktile;
320 if (khi > dimk) khi = dimk;
325 for (
long i=0; i<dimi; ++i,c+=dimj,++
a) {
327 for (
long k=0;
k<nk; ++
k,q+=dimi) ai[
k] = *q;
329 const double* bj0 =
b;
330 for (
long j=0; j<dimj2; j+=2,bj0+=2*dimk) {
331 const double* bj1 = bj0+dimk;
334 for (
long k=0;
k<nk; ++
k) {
335 sum0 += ai[
k]*bj0[
k];
336 sum1 += ai[
k]*bj1[
k];
342 for (
long j=dimj2; j<dimj; ++j,bj0+=dimk) {
344 for (
long k=0;
k<nk; ++
k) {
353 #endif // MADNESS_TENSOR_MXM_H__INCLUDED
const T1 &f1 return GTEST_2_TUPLE_() T(f0, f1)
const int k
Definition: dielectric.cc:184
FLOAT a(int j, FLOAT z)
Definition: y1.cc:86
void mxmT(long dimi, long dimj, long dimk, double *restrict c, const double *restrict a, const double *restrict b)
Matrix * matrix transpose (hand unrolled version)
Definition: mxm.h:215
void mxm(long dimi, long dimj, long dimk, double *restrict c, const double *restrict a, const double *restrict b)
Matrix * matrix (hand unrolled version)
Definition: mxm.h:264
void mTxm(long dimi, long dimj, long dimk, double *restrict c, const double *restrict a, const double *restrict b)
Matrix transpose * matrix (hand unrolled version)
Definition: mxm.h:168
const mpreal sum(const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode)
Definition: mpreal.cc:241
#define restrict
Definition: config.h:403
const double c
Definition: gfit.cc:200
FLOAT b(int j, FLOAT z)
Definition: y1.cc:79
void mTxmT(long dimi, long dimj, long dimk, double *restrict csave, const double *restrict asave, const double *restrict b)
Matrix transpose * matrix transpose (hand tiled and unrolled)
Definition: mxm.h:303