+ // Read the scalar arguments, which arrive as pointers.
+ int nc = *pnc;
+ int n = *pn;
+ int d = *pd;
+ // dim = d + d^2 + d^3
+ int dim = d + d*d + d*d*d;
+ // W is not allocated here (this malloc is disabled), so it must already
+ // provide room for dim*dim entries -- presumably supplied by the caller.
+ //double* W = (double*)malloc(dim*dim*sizeof(double));
+
+ // (Re)initialize W: zero every entry, one row of dim entries at a time.
+ for (int row=0; row<dim; row++)
+ {
+   int rowStart = row * dim;
+   for (int col=0; col<dim; col++)
+     W[rowStart+col] = 0.0;
+ }
+ // Accumulate sum_i t(g_i) %*% g_i into the entries W[j*dim+k] with k <= j,
+ // then normalize by n and mirror the result to the other triangle.
+ //
+ // Every thread works with its own g vector and its own partial accumulator.
+ // Sharing one g across the team, or doing "W[...] +=" directly from all
+ // threads, is a data race: threads would overwrite each other's g while it
+ // is being read, and lose updates to W.
+ // W is expected to be zeroed before this point (done just above).
+ // NOTE(review): allocations are unchecked, as in the original code; the
+ // enclosing function's error-reporting convention is not visible here.
+ omp_set_num_threads(nc >= 1 ? nc : omp_get_num_procs());
+ #pragma omp parallel
+ {
+   // Thread-private work buffers.
+   double* g = (double*)malloc(dim*sizeof(double));
+   double* Wloc = (double*)malloc((size_t)dim*dim*sizeof(double));
+   // Only the entries k <= j of each row are ever touched.
+   for (int j=0; j<dim; j++)
+   {
+     for (int k=0; k<=j; k++)
+       Wloc[j*dim+k] = 0.0;
+   }
+
+   #pragma omp for
+   for (int i=0; i<n; i++)
+   {
+     // g == gi, built in three consecutive segments of sizes d, d*d, d*d*d.
+     // Segment 1: Y[i] * X[i,j] - M[j]
+     for (int j=0; j<d; j++)
+       g[j] = Y[i] * X[mi(i,j,n,d)] - M[j];
+     // Segment 2: Y[i] * (X[i,a]*X[i,b] - 1{a==b}) - M[j]
+     for (int j=d; j<d+(d*d); j++)
+     {
+       int idx1 = (j-d) % d; //num row
+       int idx2 = ((j-d) - idx1) / d; //num col
+       g[j] = 0.0;
+       if (idx1 == idx2)
+         g[j] -= Y[i];
+       g[j] += Y[i] * X[mi(i,idx1,n,d)]*X[mi(i,idx2,n,d)] - M[j];
+     }
+     // Segment 3: Y[i] * (X[i,a]*X[i,b]*X[i,c] - 1{a==b}*X[i,c]
+     //            - 1{a==c}*X[i,b] - 1{b==c}*X[i,a]) - M[j]
+     for (int j=d+d*d; j<dim; j++)
+     {
+       int idx1 = (j-d-d*d) % d; //num row
+       int idx2 = ((j-d-d*d - idx1) / d) %d; //num col
+       int idx3 = (((j-d-d*d - idx1) / d) - idx2) / d; //num "depth"
+       g[j] = 0.0;
+       if (idx1 == idx2)
+         g[j] -= Y[i] * X[mi(i,idx3,n,d)];
+       if (idx1 == idx3)
+         g[j] -= Y[i] * X[mi(i,idx2,n,d)];
+       if (idx2 == idx3)
+         g[j] -= Y[i] * X[mi(i,idx1,n,d)];
+       g[j] += Y[i] * X[mi(i,idx1,n,d)]*X[mi(i,idx2,n,d)]*X[mi(i,idx3,n,d)] - M[j];
+     }
+     // Add t(gi) %*% gi to this thread's partial sum (entries k <= j only).
+     for (int j=0; j<dim; j++)
+     {
+       // This final nested loop is very costly. Some basic optimisations:
+       double gj = g[j];
+       int baseIdx = j * dim;
+       #pragma GCC unroll 32
+       for (int k=j; k>=0; k--)
+         Wloc[baseIdx+k] += gj * g[k];
+     }
+   }
+
+   // Merge the partial sums into W, one thread at a time.
+   #pragma omp critical
+   {
+     for (int j=0; j<dim; j++)
+     {
+       int baseIdx = j * dim;
+       for (int k=0; k<=j; k++)
+         W[baseIdx+k] += Wloc[baseIdx+k];
+     }
+   }
+   free(Wloc);
+   free(g);
+ }
+ // Normalize W: x 1/n
+ // NOTE(review): mi() is defined elsewhere; this assumes mi(j,k,dim,dim) with
+ // k >= j addresses the same entries that were accumulated above as
+ // W[j*dim+k] with k <= j -- confirm against the macro.
+ for (int j=0; j<dim; j++)
+ {
+   for (int k=j; k<dim; k++)
+     W[mi(j,k,dim,dim)] /= n;
+ }
+ // Symmetrize W: W[k,j] = W[j,k] for k > j
+ for (int j=0; j<dim; j++)
+ {
+   for (int k=j+1; k<dim; k++)
+     W[mi(k,j,dim,dim)] = W[mi(j,k,dim,dim)];
+ }