prepare EMGLLF / EMGrank wrappers, simplify folder generateTestData
[valse.git] / R / main.R
index b5b5144..42852d3 100644 (file)
--- a/R/main.R
+++ b/R/main.R
-## TODO: turn this code into R
-
-classdef selmix < handle
-    
-    properties (SetAccess = private)
-        %always user defined
-        X; % regression data (size n*p, where n is the number of observations, and p is the number of regressors)
-        Y; % response data (size n*m, where n is the number of observations, and m is the number of responses)
-        
-        %optionally user defined some default values
-        gamma; % power in the penalty
-        mini; % minimum number of iterations for EM algorithm
-        maxi; % maximum number of iterations for EM algorithm
-        eps; % threshold for stopping EM algorithm
-        kmin; % minimum number of components in the mixture
-        kmax; % maximum number of components in the mixture
-        rangmin;
-        rangmax;
-        
-        %computed through the workflow
-        phiInit; % initialisation for the reparametrized conditional mean parameter
-        rhoInit; % initialisation for the reparametrized variance parameter
-        piInit; % initialisation for the proportions
-        tauInit; % initialisation for the allocations probabilities in each component
-        gridLambda = []; % values for the regularization parameter grid
-        A1; %je ne crois pas vraiment qu'il faille les mettre en sortie, d'autant plus qu'on construit une matrice A1 et A2 pour chaque k, et elles sont grandes, donc ca coute un peu cher ...
-        A2;
-        Phi; % collection of estimations for the reparametrized conditional mean parameters
-        Rho; % collection of estimations for the reparametrized variance parameters
-        Pi; % collection of estimations for the proportions parameters
-    end
-    
-    properties (Constant)
-        %immutable
-        seuil = 1e-15;
-    end
-    
-    methods
-        %%%%%%%%%%%%%%%%%%%%%%%
-        %initialize main object
-        %%%%%%%%%%%%%%%%%%%%%%%
-        
-        function sx = selmix(X,Y,varargin)
-            %set defaults for optional inputs
-            optargs = {1.0 5 10 1e-6 2 3 2 3};
-            %replace defaults by user parameters
-            optargs(1:length(varargin)) = varargin;
-            sx.X = X;
-            sx.Y = Y;
-            [sx.gamma,sx.mini,sx.maxi,sx.eps,sx.kmin,sx.kmax,sx.rangmin,sx.rangmax] = optargs{:};
-            %                  z = somme(sx.X,sx.Y);
-            %                  sx.Z=z;
-        end
-        
-        
-        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-        %core workflow: compute all models
-        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-        
-        function initParameters(sx,k)
-            [phi0,rho0,pi0,tau0] = initSmallEM(k,sx.X,sx.Y,sx.eps); %smallEM initializes parameters by k-means and regression model in each component, doing this 20 times, and keeping the values maximizing the likelihood after 10 iterations of the EM algorithm.
-            sx.phiInit = phi0;
-            sx.rhoInit = rho0;
-            sx.piInit  = pi0;
-            sx.tauInit = tau0;
-        end
-        
-        function computeGridLambda(sx)
-            [sx.gridLambda] = grillelambda(sx.phiInit,sx.rhoInit,sx.piInit,sx.tauInit,sx.X,sx.Y,sx.gamma,sx.mini,sx.maxi,sx.eps);
-            % computation of the regularization grid, according to explicit
-            % formulae given by EM algorithm.
-        end
-        
-        function computeRelevantParameters(sx)
-            [sx.A1,sx.A2,sx.Rho,sx.Pi] = selectiontotale(sx.phiInit,sx.rhoInit,sx.piInit,sx.tauInit,sx.mini,sx.maxi,sx.gamma,sx.gridLambda,sx.X,sx.Y,sx.seuil,sx.eps);
-            %select variables according to each regularization parameter
-            %from the grid: sx.A1 corresponding to selected variables, and
-            %sx.A2 corresponding to unselected variables.
-        end
-        
-        function [sx,phi,rho,pi]=runProcedure1(sx)
-            [phi,rho,pi,~] = constructionModelesLassoMLE(sx.phiInit,sx.rhoInit,sx.piInit,sx.tauInit,sx.mini,sx.maxi,sx.gamma,sx.gridLambda,sx.X,sx.Y,sx.seuil,sx.eps,sx.A1,sx.A2);
-            %compute parameter estimations, with the Maximum Likelihood
-            %Estimator, restricted on selected variables.
-        end
-        
-        function [phi] =runProcedure2(sx)
-            [phi,~] = constructionModelesLassoRank(sx.Pi,sx.Rho,sx.mini,sx.maxi,sx.X,sx.Y,sx.eps,sx.A1,sx.rangmin,sx.rangmax);
-            %compute parameter estimations, with the Low Rank
-            %Estimator, restricted on selected variables.
-        end
-        
-        % main loop: over all k and all lambda
-        function run(sx,procedure) % Run the all procedure, 1 with the
-            %maximum likelihood refitting, and 2 with the Low Rank refitting.
-            [p,m,~]=size(sx.phiInit);
-            for k=sx.kmin:sx.kmax
-                k
-                initParameters(sx,k);
-                computeGridLambda(sx);
-                computeRelevantParameters(sx);
-                if (procedure == 1)
-                    [~,phi,rho,pi] = runProcedure1(sx);
-                    Phi2 = sx.Phi;
-                    Rho2 = sx.Rho;
-                    Pi2 = sx.Pi;
-                    p = size(sx.X,2);
-                    m = size(sx.Y,2);
-                    if size(Phi2) == 0
-                        sx.Phi(:,:,1:k,:) = phi;
-                        sx.Rho(:,:,1:k,:) = rho;
-                        sx.Pi(1:k,:) = pi;
-                    else
-                        sx.Phi = zeros(p,m,sx.kmax,size(Phi2,4)+size(phi,4));
-                        sx.Phi(:,:,1:size(Phi2,3),1:size(Phi2,4)) = Phi2;
-                        sx.Phi(:,:,1:k,size(Phi2,4)+1:end) = phi;
-                        sx.Rho = zeros(m,m,sx.kmax,size(Rho2,4)+size(rho,4));
-                        sx.Rho(:,:,1:size(Rho2,3),1:size(Rho2,4)) = Rho2;
-                        sx.Rho(:,:,1:k,size(Rho2,4)+1:end) = rho;
-                        sx.Pi = zeros(sx.kmax,size(Pi2,2)+size(pi,2));
-                        sx.Pi(1:size(Pi2,1),1:size(Pi2,2)) = Pi2;
-                        sx.Pi(1:k,size(Pi2,2)+1:end) = pi;
-                    end
-                else
-                    [phi] = runProcedure2(sx);
-                    phi
-                    Phi2 = sx.Phi;
-                    if size(Phi2,1) == 0
-                        sx.Phi(:,:,1:k,:) = phi;
-                    else
-                        size(Phi2)
-                        sx.Phi = zeros(p,m,sx.kmax,size(Phi2,4)+size(phi,4));
-                        size(sx.Phi)
-                        sx.Phi(:,:,1:size(Phi2,3),1:size(Phi2,4)) = Phi2;
-                        sx.Phi(:,:,1:k,size(Phi2,4)+1:end) = phi;
-                    end
-                    
-                end
-                
-                
-            end
-        end
-        
-        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-        %pruning: select only one (or a few best ?!) model
-        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-        %
-        %              function[model] selectModel(sx)
-        %                      %TODO
-        %                      %model = sxModel(...);
-        %              end
-        
-    end
-    
-end
-
-
-%%%%%%%%%%%%%
-%OLD VERSION:
-
-%~ for k=K
-%~ % On initialise
-%~ initialisation
-%~ disp('Initialisé')
-%~ % On construit la grille des lambdas : variables informatives
-%~ [glambda,phiEmv,rhoEmv,piEmv]=grillelambda(phiInit,rhoInit,piInit,tauInit,x,y,gamma,mini,maxi,tau);
-%~ glambda=glambda(1:3);
-%~ disp('glambda construite')
-%~ % On trouve les variables informatives pour chaque lambda : S est la
-%~ % matrice des coefficients des variables informatives
-%~ % on parallelise à l interieur du selectiontotale()
-%~ [B1,B2,rhoLasso,piLasso]=selectiontotale(phiInit,rhoInit,piInit,tauInit,mini,maxi,gamma,glambda,x,y,10^-15,tau);
-%~ %S1 les variables informatives, S2 celles qui ne le sont pas
-%~ [B1bis,B2bis,glambda2bis,ind,rhoLasso,piLasso]=suppressionmodelesegaux(B1,B2,glambda,rhoLasso,piLasso);
-%~ dessinVariablesSelectionnees;
-%~ disp('Le Lasso est fait')
-%~ % Pour chaque lambda ainsi construit, on veut calculer l'EMV pour la procédure Lasso-MLE
-%~ %On obtient une collection de modèles pour Lasso-MLE
-%~ % ICI AUSSI ON PEUT PARALLELISER a l interieur de constructionModelesLassoMLE
-%~ nombreLambda=size(B1bis,3);
-%~ %[phiLassoMLE,rhoLassoMLE,piLassoMLE,vraisLassoMLE]=constructionModelesLassoMLE(phiInit,rhoInit,piInit, tauInit,mini,maxi,gamma,glambda2bis,X,Y,seuil,tau,B1bis,B2bis)
-%~ %Pour Lasso-Rank
-%~ %on ne garde que les colonnes actives
-%~ B1ter=B1bis(:,1,:);
-%~ %           [B1ter,ind,rhoLasso,piLasso]=suppressionmodelesegaux2(B1ter,rhoLasso,piLasso)
-%~ %Pour chaque lambda, on veut calculer l'estimateur low rank pour la procédure Lasso-Rank
-%~ %On obtient une collection de modèles pour Lasso-Rank
-%~ %ICI AUSSI ON PEUT PARALLELISER a linterieur de constructionModelesLassoRank
-%~ nombreLambda2=size(B1ter,2);
-%~ [phi,lvraisemblance,Z] = constructionModelesLassoRank(...
-%~ piEmv,rhoEmv,mini,maxi,X,Y,tau,B1ter,2,4);
-%~ disp('On a construit les modèles pour les deux procédures')
-%~ end
-%~ %selectionModelesLassoMLE;
-%~ %selectionModelesLassoRank;
-%~ disp('On a sélectionné les modèles dans les deux procédures')
+#' @useDynLib valse
+
+Valse = setRefClass(
+       Class = "Valse",
+
+       fields = c(
+               # User defined
+
+               # regression data (size n*p, where n is the number of observations,
+               # and p is the number of regressors)
+               X = "matrix",
+               # response data (size n*m, where n is the number of observations,
+               # and m is the number of responses)
+               Y = "matrix",
+
+               # Optionally user defined (some default values)
+
+               # power in the penalty
+               gamma = "numeric",
+               # minimum number of iterations for EM algorithm
+               mini = "integer",
+               # maximum number of iterations for EM algorithm
+               maxi = "integer",
+               # threshold for stopping EM algorithm
+               eps = "numeric",
+               # minimum number of components in the mixture
+               kmin = "integer",
+               # maximum number of components in the mixture
+               kmax = "integer",
+               # ranks for the Lasso-Rank procedure
+               rank.min = "integer",
+               rank.max = "integer",
+
+               # Computed through the workflow
+
+               # initialisation for the reparametrized conditional mean parameter
+               phiInit = "numeric",
+               # initialisation for the reparametrized variance parameter
+               rhoInit = "numeric",
+               # initialisation for the proportions
+               piInit = "numeric",
+               # initialisation for the allocations probabilities in each component
+               tauInit = "numeric",
+               # values for the regularization parameter grid
+               gridLambda = "numeric",
+               # je ne crois pas vraiment qu'il faille les mettre en sortie, d'autant plus qu'on construit
+               # une matrice A1 et A2 pour chaque k, et elles sont grandes, donc ca coute un peu cher ...
+               A1 = "integer",
+               A2 = "integer",
+               # collection of estimations for the reparametrized conditional mean parameters
+               Phi = "numeric",
+               # collection of estimations for the reparametrized variance parameters
+               Rho = "numeric",
+               # collection of estimations for the proportions parameters
+               Pi = "numeric",
+
+               #immutable (TODO:?)
+               thresh = "numeric"
+       ),
+
+       methods = list(
+               #######################
+               #initialize main object
+               #######################
+               initialize = function(X,Y,...)
+               {
+                       "Initialize Valse object"
+
+                       callSuper(...)
+
+                       X <<- X
+                       Y <<- Y
+                       gamma <<- ifelse (hasArg("gamma"), gamma, 1.)
+                       mini <<- ifelse (hasArg("mini"), mini, as.integer(5))
+                       maxi <<- ifelse (hasArg("maxi"), maxi, as.integer(10))
+                       eps <<- ifelse (hasArg("eps"), eps, 1e-6)
+                       kmin <<- ifelse (hasArg("kmin"), kmin, as.integer(2))
+                       kmax <<- ifelse (hasArg("kmax"), kmax, as.integer(3))
+                       rank.min <<- ifelse (hasArg("rank.min"), rank.min, as.integer(2))
+                       rank.max <<- ifelse (hasArg("rank.max"), rank.max, as.integer(3))
+                       thresh <<- 1e-15 #immutable (TODO:?)
+               },
+
+               ##################################
+               #core workflow: compute all models
+               ##################################
+
+               initParameters = function(k)
+               {
+                       "Parameters initialization"
+
+                       #smallEM initializes parameters by k-means and regression model in each component,
+                       #doing this 20 times, and keeping the values maximizing the likelihood after 10
+                       #iterations of the EM algorithm.
+                       init = initSmallEM(k,X,Y,eps)
+                       phiInit <<- init$phi0
+                       rhoInit <<- init$rho0
+                       piInit  <<- init$pi0
+                       tauInit <<- init$tau0
+               },
+
+               computeGridLambda = function()
+               {
+                       "computation of the regularization grid"
+                       #(according to explicit formula given by EM algorithm)
+
+                       gridLambda <<- gridLambda(phiInit,rhoInit,piInit,tauInit,X,Y,gamma,mini,maxi,eps)
+               },
+
+               computeRelevantParameters = function()
+               {
+                       "Compute relevant parameters"
+
+                       #select variables according to each regularization parameter
+                       #from the grid: A1 corresponding to selected variables, and
+                       #A2 corresponding to unselected variables.
+                       params = selectiontotale(
+                               phiInit,rhoInit,piInit,tauInit,mini,maxi,gamma,gridLambda,X,Y,thresh,eps)
+                       A1 <<- params$A1
+                       A2 <<- params$A2
+                       Rho <<- params$Rho
+                       Pi <<- params$Pi
+               },
+
+               runProcedure1 = function()
+               {
+                       "Run procedure 1 [EMGLLF]"
+
+                       #compute parameter estimations, with the Maximum Likelihood
+                       #Estimator, restricted on selected variables.
+                       return ( constructionModelesLassoMLE(
+                               phiInit,rhoInit,piInit,tauInit,mini,maxi,gamma,gridLambda,X,Y,thresh,eps,A1,A2) )
+               },
+
+               runProcedure2 = function()
+               {
+                       "Run procedure 2 [EMGrank]"
+
+                       #compute parameter estimations, with the Low Rank
+                       #Estimator, restricted on selected variables.
+                       return ( constructionModelesLassoRank(Pi,Rho,mini,maxi,X,Y,eps,
+                               A1,rank.min,rank.max) )
+               },
+
+               run = function()
+               {
+                       "main loop: over all k and all lambda"
+
+                       # Run the whole procedure, 1 with the
+                       #maximum likelihood refitting, and 2 with the Low Rank refitting.
+                       p = dim(phiInit)[1]
+                       m = dim(phiInit)[2]
+                       for (k in kmin:kmax)
+                       {
+                               print(k)
+                               initParameters(k)
+                               computeGridLambda()
+                               computeRelevantParameters()
+                               if (procedure == 1)
+                               {
+                                       r1 = runProcedure1()
+                                       Phi2 = Phi
+                                       Rho2 = Rho
+                                       Pi2 = Pi
+                                       p = ncol(X)
+                                       m = ncol(Y)
+                                       if (is.null(dim(Phi2))) #test was: size(Phi2) == 0
+                                       {
+                                               Phi[,,1:k] <<- r1$phi
+                                               Rho[,,1:k] <<- r1$rho
+                                               Pi[1:k,] <<- r1$pi
+                                       } else
+                                       {
+                                               Phi <<- array(0., dim=c(p,m,kmax,dim(Phi2)[4]+dim(r1$phi)[4]))
+                                               Phi[,,1:(dim(Phi2)[3]),1:(dim(Phi2)[4])] <<- Phi2
+                                               Phi[,,1:k,dim(Phi2)[4]+1] <<- r1$phi
+                                               Rho <<- array(0., dim=c(m,m,kmax,dim(Rho2)[4]+dim(r1$rho)[4]))
+                                               Rho[,,1:(dim(Rho2)[3]),1:(dim(Rho2)[4])] <<- Rho2
+                                               Rho[,,1:k,dim(Rho2)[4]+1] <<- r1$rho
+                                               Pi <<- array(0., dim=c(kmax,dim(Pi2)[2]+dim(r1$pi)[2]))
+                                               Pi[1:nrow(Pi2),1:ncol(Pi2)] <<- Pi2
+                                               Pi[1:k,ncol(Pi2)+1] <<- r1$pi
+                                       }
+                               } else
+                               {
+                                       phi = runProcedure2()$phi
+                                       Phi2 = Phi
+                                       if (dim(Phi2)[1] == 0)
+                                       {
+                                               Phi[,,1:k,] <<- phi
+                                       } else
+                                       {
+                                               Phi <<- array(0., dim=c(p,m,kmax,dim(Phi2)[4]+dim(phi)[4]))
+                                               Phi[,,1:(dim(Phi2)[3]),1:(dim(Phi2)[4])] <<- Phi2
+                                               Phi[,,1:k,-(1:(dim(Phi2)[4]))] <<- phi
+                                       }
+                               }
+                       }
+               }
+
+               ##################################################
+               #TODO: pruning: select only one (or a few best ?!) model
+               ##################################################
+               #
+               #               function[model] selectModel(
+               #                       #TODO
+               #                       #model = odel(...)
+               #               end
+               # Give at least the slope heuristic and BIC, and AIC ?
+
+               )
+)