#include <stdlib.h>
+#include <omp.h>
// Index matrix (by columns): linear offset of element (i, j) in a matrix with
// d1 rows stored one column after another, i.e. j*d1 + i. d2 is accepted but
// does not appear in the formula.
// This patch replaces the function with a macro of the same name.
// NOTE(review): the macro expansion does not parenthesize its arguments, so an
// argument such as a+b would be combined with the wrong precedence. The calls
// visible in this file pass plain identifiers only -- confirm for other users.
-int mi(int i, int j, int d1, int d2)
-{
- return j*d1 + i;
-}
+#define mi(i, j, d1, d2) (j*d1 + i)
// Index 3-tensor (by columns, matrices ordered by last dim): linear offset of
// element (i, j, k) computed as k*d1*d2 + j*d1 + i. d3 is accepted but does
// not appear in the formula.
// This patch replaces the function with a macro of the same name.
// NOTE(review): the macro expansion does not parenthesize its arguments, so an
// argument such as a+b would be combined with the wrong precedence. The calls
// visible in this file pass plain identifiers only -- confirm for other users.
-int ti(int i, int j, int k, int d1, int d2, int d3)
-{
- return k*d1*d2 + j*d1 + i;
-}
+#define ti(i, j, k, d1, d2, d3) (k*d1*d2 + j*d1 + i)
// Empirical cross-moment of order 2 between X size nxd and Y size n
//
// X  : read through mi(i,j,n,d), i.e. as an n x d matrix stored by columns
// Y  : n values
// pn : pointer to n (number of rows of X)
// pd : pointer to d (number of columns of X)
// M2 : d x d result, addressed through mi(j,k,d,d)
//
// M2 is only updated with += and -= below and is never reset in this function
// (the calloc line is commented out), so the result is added on top of whatever
// the buffer already holds -- presumably the caller passes zeros; confirm.
// As shown here, the removed and added lines of this hunk differ only in
// whitespace.
void Moments_M2(double* X, double* Y, int* pn, int* pd, double* M2)
{
- int n=*pn, d=*pd;
- //double* M2 = (double*)calloc(d*d,sizeof(double));
+ int n=*pn, d=*pd;
+ //double* M2 = (double*)calloc(d*d,sizeof(double));
- // M2 = E[Y*X^*2] - E[Y*e^*2] = E[Y (X^*2 - I)]
- for (int j=0; j<d; j++)
- {
- for (int i=0; i<n; i++)
- {
- M2[mi(j,j,d,d)] -= Y[i] / n;
- for (int k=0; k<d; k++)
- M2[mi(j,k,d,d)] += Y[i] * X[mi(i,j,n,d)]*X[mi(i,k,n,d)] / n;
- }
- }
+ // M2 = E[Y*X^*2] - E[Y*e^*2] = E[Y (X^*2 - I)]
+ for (int j=0; j<d; j++)
+ {
+ for (int i=0; i<n; i++)
+ {
+ M2[mi(j,j,d,d)] -= Y[i] / n;
+ for (int k=0; k<d; k++)
+ M2[mi(j,k,d,d)] += Y[i] * X[mi(i,j,n,d)]*X[mi(i,k,n,d)] / n;
+ }
+ }
}
// Empirical cross-moment of order 3 between X size nxd and Y size n
//
// X  : read through mi(i,j,n,d), i.e. as an n x d matrix stored by columns
// Y  : n values
// pn : pointer to n (number of rows of X)
// pd : pointer to d (number of columns of X)
// M3 : d x d x d result, addressed through ti(.,.,.,d,d,d)
//
// For every (j, k, i) the term Y[i]*X[i,k]/n is subtracted from the entries
// (j,k,j), (j,j,k) and (k,j,j), and Y[i]*X[i,j]*X[i,k]*X[i,o]/n is added to
// (j,k,o) for every o.
// M3 is only updated with += and -= and is never reset in this function (the
// calloc line is commented out) -- presumably the caller passes zeros; confirm.
// As shown here, the removed and added lines of this hunk differ only in
// whitespace.
void Moments_M3(double* X, double* Y, int* pn, int* pd, double* M3)
{
- int n=*pn, d=*pd;
- //double* M3 = (double*)calloc(d*d*d,sizeof(double));
+ int n=*pn, d=*pd;
+ //double* M3 = (double*)calloc(d*d*d,sizeof(double));
- // M3 = E[Y*X^*3] - E[Y*e*X*e] - E[Y*e*e*X] - E[Y*X*e*e]
- for (int j=0; j<d; j++)
- {
- for (int k=0; k<d; k++)
- {
- for (int i=0; i<n; i++)
- {
- double tensor_elt = Y[i]*X[mi(i,k,n,d)] / n;
- M3[ti(j,k,j,d,d,d)] -= tensor_elt;
- M3[ti(j,j,k,d,d,d)] -= tensor_elt;
- M3[ti(k,j,j,d,d,d)] -= tensor_elt;
- for (int o=0; o<d; o++)
- M3[ti(j,k,o,d,d,d)] += Y[i] * X[mi(i,j,n,d)]*X[mi(i,k,n,d)]*X[mi(i,o,n,d)] / n;
- }
- }
- }
+ // M3 = E[Y*X^*3] - E[Y*e*X*e] - E[Y*e*e*X] - E[Y*X*e*e]
+ for (int j=0; j<d; j++)
+ {
+ for (int k=0; k<d; k++)
+ {
+ for (int i=0; i<n; i++)
+ {
+ double tensor_elt = Y[i]*X[mi(i,k,n,d)] / n;
+ M3[ti(j,k,j,d,d,d)] -= tensor_elt;
+ M3[ti(j,j,k,d,d,d)] -= tensor_elt;
+ M3[ti(k,j,j,d,d,d)] -= tensor_elt;
+ for (int o=0; o<d; o++)
+ M3[ti(j,k,o,d,d,d)] += Y[i] * X[mi(i,j,n,d)]*X[mi(i,k,n,d)]*X[mi(i,o,n,d)] / n;
+ }
+ }
+ }
}
-#include <stdio.h>
-
// W = 1/N sum( t(g(Zi,theta)) g(Zi,theta) )
// with g(Zi, theta) = i-th contribution to all moments (size dim) - real moments
//
// Arguments, as they are used below:
//   X   : read through mi(i,j,n,d), i.e. as an n x d matrix stored by columns
//   Y   : n values (double* before this patch, int* after it)
//   M   : dim values subtracted from each contribution, dim = d + d*d + d*d*d
//   pnc : (added by this patch) pointer to the requested OpenMP thread count;
//         a value below 1 selects omp_get_num_procs() instead
//   pn  : pointer to n
//   pd  : pointer to d
//   W   : dim x dim output, reset to zero on entry and then filled
//
// NOTE(review): this listing is a patch and some of its context lines are not
// shown (for example the declarations of idx1/idx2/idx3 and a few braces), so
// the comments below only describe the lines that are visible.
-void Compute_Omega(double* X, double* Y, double* M, int* pn, int* pd, double* W)
+void Compute_Omega(double* X, int* Y, double* M, int* pnc, int* pn, int* pd, double* W)
{
- int n=*pn, d=*pd;
+ int nc=*pnc, n=*pn, d=*pd;
 int dim = d + d*d + d*d*d;
-
-//printf("X: \n");
-//for (int kk=0; kk<d*n; kk++) printf("%f ",X[kk]);
-//printf("\n");
-//printf("Y: \n");
-//for (int kk=0; kk<n; kk++) printf("%f ",Y[kk]);
-//printf("\n");
-//printf("M: \n");
-//for (int kk=0; kk<dim; kk++) printf("%f ",M[kk]);
-//printf("\n");
+ //double* W = (double*)malloc(dim*dim*sizeof(double));
 // (Re)Initialize W:
 for (int j=0; j<dim; j++)
 for (int k=0; k<dim; k++)
 W[j*dim+k] = 0.0;
 }
-
- //double* W = (double*)calloc(dim*dim,sizeof(double));
 // Scratch buffer holding the dim entries of g for the current observation.
 // NOTE(review): the malloc result is used below without a NULL check.
 double* g = (double*)malloc(dim*sizeof(double));
 // NOTE(review): g is allocated once, before the parallel loop, and every
 // iteration writes to it; W is also updated with += from inside the loop. No
 // private/reduction/atomic clause is visible on the pragma, so with more than
 // one thread this looks like a data race on both g and W -- confirm.
+ omp_set_num_threads(nc >= 1 ? nc : omp_get_num_procs());
+ #pragma omp parallel for
 for (int i=0; i<n; i++)
 {
- // Fill gi:
+ // g == gi:
 // Entries [0, d): first-order terms.
 for (int j=0; j<d; j++)
 g[j] = Y[i] * X[mi(i,j,n,d)] - M[j];
 // Entries [d, d+d*d): second-order terms, with Y[i] removed on the diagonal
 // (idx1 == idx2).
 for (int j=d; j<d+(d*d); j++)
 g[j] = 0.0;
 if (idx1 == idx2)
 g[j] -= Y[i];
- g[j] += Y[i] * X[mi(i,idx1,n,d)]*X[mi(i,idx2,n,d)] - M[j];
+ g[j] += Y[i] * X[mi(i,idx1,n,d)]*X[mi(i,idx2,n,d)] - M[j];
 }
 // Entries [d+d*d, dim): third-order terms.
 for (int j=d+d*d; j<dim; j++)
 {
 g[j] -= Y[i] * X[mi(i,idx1,n,d)];
 g[j] += Y[i] * X[mi(i,idx1,n,d)]*X[mi(i,idx2,n,d)]*X[mi(i,idx3,n,d)] - M[j];
 }
-
-//printf("i=%i, g=: \n", i);
-//for (int kk=0; kk<d; kk++) printf("%f ",g[kk]);
-//printf("\n");
-
 // Add 1/n t(gi) %*% gi to W
 // After this patch only the entries W[j*dim+k] with k <= j are accumulated
 // here, and the division by n is deferred to the normalization pass below.
 for (int j=0; j<dim; j++)
 {
- for (int k=0; k<dim; k++)
- W[j*dim+k] += g[j] * g[k] / n;
+ // This final nested loop is very costly. Some basic optimisations:
+ double gj = g[j];
+ int baseIdx = j * dim;
+ #pragma GCC unroll 32
+ for (int k=j; k>=0; k--)
+ W[baseIdx+k] += gj * g[k];
 }
 }
 // mi(j,k,dim,dim) expands to k*dim + j, so with k >= j the pass below touches
 // exactly the entries that the accumulation loop above wrote.
+ // Normalize W: x 1/n
+ for (int j=0; j<dim; j++)
+ {
+ for (int k=j; k<dim; k++)
+ W[mi(j,k,dim,dim)] /= n;
+ }
 // Copies each accumulated entry into its mirror position, which the
 // accumulation loop left at zero.
+ // Symmetrize W: W[k,j] = W[j,k] for k > j
+ for (int j=0; j<dim; j++)
+ {
+ for (int k=j+1; k<dim; k++)
+ W[mi(k,j,dim,dim)] = W[mi(j,k,dim,dim)];
+ }
 free(g);
-
-// for (int j=0; j<dim; j++)
-// {
-// printf("\n");
-// for (int k=0; k<dim; k++)
-// printf("%f ",W[j*dim+k]);
-// }
}