[synclust.git] / R / main.R

#' Direct clustering from a neighborhoods graph, or get regions from (Poisson) 
#' distribution parameters optimization, using convex relaxation.
#'
#' @param method Global method: "direct" or "convex"
#' @param M Matrix of observations in rows, the two last columns 
#'        corresponding to geographic coordinates; 
#'        set to NULL to use our initial dataset (625 rows / 9 years)
#' @param k Number of neighbors
#' @param alpha Weight parameter for intra-neighborhoods distance computations; 
#'        0 = take only geographic coordinates into account; 
#'        1 = take only observations over the years into account; 
#'        in-between : several levels of compromise; 
#'        -1 or any negative value : use a heuristic to choose alpha.
#' @param gmode Neighborhood type. 0 = reduced [mutual] kNN; 1 = augmented kNN (symmetric); 
#'        2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric). 
#'        NOTE: gmode==3 automatically sets k==4 (at most!)
#' @param K Number of clusters
#' @param dtype Distance type, in {"simple","spath","ectd"}. 
#'        NOTE: better avoid "simple" if gmode>=2
#' @param cmeth Clustering method, in {"KM","HC","spec"} for k-means (distances based) 
#'        or hierarchical clustering, or spectral clustering (only if gmode>=2)
#' @param pcoef Penalty value for convex optimization [default: 1.0]
#' @param h Step in the min LL algorithm [default: 1e-3]
#' @param eps Threshold to stop min.LL iterations [default: 1e-3]
#' @param maxit Maximum number of iterations in the min LL algo [default: 1e3]
#' @param showLL Print trace of log-likelihood evolution [default: true]
#' @param disp True [default] for interactive display (otherwise nothing gets plotted)
#' @return list with the following entries. M: data matrix in input; NI: computed neighborhoods; 
#'         dists: computed distances matrix; clusts: partition into K clusters, as an integer vector; 
#'         cxpar: parameters obtained after convex optimization (if applicable)
#' @export
#' @examples
#' cvr = findSyncVarRegions("convex",M=NULL,k=10,alpha=0.1,gmode=1,K=5,dtype="spath",cmeth="HC")
#' drawMapWithSitez(cvr$M, cvr$clusters)
#' drawNeighboroodGraph(cvr$M, cvr$NI)
#'
findSyncVarRegions = function(method, M, k, alpha, gmode, K, dtype, cmeth,
	pcoef=1.0, h=1e-3, eps=1e-3, maxit=1e3, showLL=TRUE, disp=TRUE)
{
	#get matrix M if not directly provided
	if (is.null(M))
	{
		data("example", package="synclust")
		M = synclust_sample
	}
	if (is.character(M))
		M = as.matrix(read.table(M))

	n = nrow(M)
	m = ncol(M)

	#pretreatment for neighborhoods search: standardize M columns
	#TODO: maybe apply only on coordinates columns ?
	std = standardize(M)

	#get neighborhoods [FALSE because NOT simpleDists; see C code]
	NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)

	#optional intermediate display : map + graph (monocolor)
	if (disp)
		promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)

	clusters = rep(1,n)
	distances = matrix(NA,nrow=n,ncol=n)
	cxpar = list()

	## DIRECT CLUSTERING ##
	if (method=="direct")
	{
		if (gmode >= 2)
			stop("'gmode' must be 0 or 1 for direct clustering")
		if (dtype=="simple")
			stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")

		#find connected components in the graph defined by NI
		cc = reordering(.Call("getConnectedComponents", NI$ix))
		nbC = max(cc)
		if (nbC > 10)
			stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
		if (nbC > 1)
			print(paste("*** WARNING:",nbC,"connex components ***"))
		clusters = cc

		#for each connected component...
		for (i in 1:nbC)
		{
			indices = (1:n)[cc == i]
			nc = length(indices)
			if (nc <= 1)
				next #nothing to do with such a small component
			
			if (nbC > 1)
			{
				doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
				if (doClust == "y")
					K = readline(">>> into how many groups ? (int >= 2)\n")
				else
					next
			}
			
			#0] remap NI in current connex component
			locNI = remapNeighbors(NI, indices)
			
			#1] determine similarity and distance matrices (e.g. using a random walk)
			locDists = getDistances(dtype, locNI)
			distances[indices,indices] = locDists
			
			#2] cluster data inside connex component according to distance matrix
			locClusters = getClusters(locDists, cmeth, K)
			maxInd = max(clusters)
			clusters[indices] = locClusters + maxInd #avoid indices overlaps
		}
	}

	## CONVEX RELAXATION ##
	else if (method=="convex")
	{		
		#preliminary: remove NA's by averaging over each serie's values
		M = replaceNAs(M)
		
		#use NI$ix and matrix M to estimate initial parameters,
		#and then iterate until convergence to get f + theta
		#theta == mean observations count at each site s
		#f == estimated variations at each site ("time-series" of T points)
		cxpar = .Call("getVarsWithConvexOptim", 
			M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
		f = cxpar$f #the only one we use (others can be checked by user later)
		
		#cluster "time-series" f, using simple kmeans/HC, spect.clust, 
		#or [in a graph] KM or HC, after redefining a NI (using f only)
		
		if (dtype=="simple")
		{
			#use R core functions
			if (cmeth=="KM")
				clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
			else if (cmeth=="HC")
			{
				hct = hclust(dist(f), method="ward")
				clusters = cutree(hct, K)
			}
			else if (cmeth=="spec")
			{
				require(kernlab)
				clusters = as.integer(specc(f, K, kpar="automatic"))
			}
		}
		
		else
		{
			# recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
			#NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
			NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
			
			#find connected components in the graph defined by NI
			cc = reordering(.Call("getConnectedComponents", NI$ix))
			
			nbC = max(cc)
			if (nbC > 10) 
				stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
			if (nbC > 1) 
				print(paste("*** WARNING:",nbC,"connex components ***"))
			clusters = cc
			
			#for each connected component...
			for (i in 1:nbC)
			{
				indices = (1:n)[cc == i]
				nc = length(indices)
				if (nc <= 1)
					next #nothing to do with such a small component
				
				if (nbC > 1)
				{
					doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
					if (doClust == "y")
						K = readline(">>> into how many groups ? (int >= 2)\n")
					else
						next
				}
				
				#0] remap NI in current connex component
				locNI = remapNeighbors(NI, indices)
				
				#1] determine similarity and distance matrices (e.g. using a random walk)
				locDists = getDistances(dtype, locNI)
				distances[indices,indices] = locDists
				
				#2] cluster data inside connex component according to distance matrix
				locClusters = getClusters(locDists, cmeth, K)
				maxInd = max(clusters)
				clusters[indices] = locClusters + maxInd #avoid indices overlaps
			}
		}
	}

	clusters = reordering(clusters)
	#optional final display : map with clusters colors
	if (disp)
		promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)

	#give back matrix M as given to the function
	M = destandardize(std)

	return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
}
Commit	Line	Data
ad26cb61 BA	1	#' Direct clustering from a neighborhoods graph, or get regions from (Poisson)
	2	#' distribution parameters optimization, using convex relaxation.
	3	#'
	4	#' @param method Global method: "direct" or "convex"
	5	#' @param M Matrix of observations in rows, the two last columns
	6	#' corresponding to geographic coordinates;
	7	#' set to NULL to use our initial dataset (625 rows / 9 years)
	8	#' @param k Number of neighbors
	9	#' @param alpha Weight parameter for intra-neighborhoods distance computations;
	10	#' 0 = take only geographic coordinates into account;
	11	#' 1 = take only observations over the years into account;
	12	#' in-between : several levels of compromise;
	13	#' -1 or any negative value : use a heuristic to choose alpha.
	14	#' @param gmode Neighborhood type. 0 = reduced [mutual] kNN; 1 = augmented kNN (symmetric);
	15	#' 2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric).
	16	#' NOTE: gmode==3 automatically sets k==4 (at most!)
	17	#' @param K Number of clusters
	18	#' @param dtype Distance type, in {"simple","spath","ectd"}.
	19	#' NOTE: better avoid "simple" if gmode>=2
	20	#' @param cmeth Clustering method, in {"KM","HC","spec"} for k-means (distances based)
	21	#' or hierarchical clustering, or spectral clustering (only if gmode>=2)
	22	#' @param pcoef Penalty value for convex optimization [default: 1.0]
	23	#' @param h Step in the min LL algorithm [default: 1e-3]
	24	#' @param eps Threshold to stop min.LL iterations [default: 1e-3]
	25	#' @param maxit Maximum number of iterations in the min LL algo [default: 1e3]
	26	#' @param showLL Print trace of log-likelihood evolution [default: true]
	27	#' @param disp True [default] for interactive display (otherwise nothing gets plotted)
	28	#' @return list with the following entries. M: data matrix in input; NI: computed neighborhoods;
	29	#' dists: computed distances matrix; clusts: partition into K clusters, as an integer vector;
	30	#' cxpar: parameters obtained after convex optimization (if applicable)
	31	#' @export
	32	#' @examples
	33	#' cvr = findSyncVarRegions("convex",M=NULL,k=10,alpha=0.1,gmode=1,K=5,dtype="spath",cmeth="HC")
	34	#' drawMapWithSitez(cvr$M, cvr$clusters)
	35	#' drawNeighboroodGraph(cvr$M, cvr$NI)
	36	#'
	37	findSyncVarRegions = function(method, M, k, alpha, gmode, K, dtype, cmeth,
	38	pcoef=1.0, h=1e-3, eps=1e-3, maxit=1e3, showLL=TRUE, disp=TRUE)
	39	{
15d1825d BA	40	#get matrix M if not directly provided
	41	if (is.null(M))
	42	{
	43	data("example", package="synclust")
	44	M = synclust_sample
	45	}
	46	if (is.character(M))
	47	M = as.matrix(read.table(M))
	48
	49	n = nrow(M)
	50	m = ncol(M)
	51
	52	#pretreatment for neighborhoods search: standardize M columns
	53	#TODO: maybe apply only on coordinates columns ?
	54	std = standardize(M)
	55
	56	#get neighborhoods [FALSE because NOT simpleDists; see C code]
	57	NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)
	58
	59	#optional intermediate display : map + graph (monocolor)
	60	if (disp)
	61	promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)
	62
	63	clusters = rep(1,n)
	64	distances = matrix(NA,nrow=n,ncol=n)
	65	cxpar = list()
	66
	67	## DIRECT CLUSTERING ##
	68	if (method=="direct")
	69	{
	70	if (gmode >= 2)
	71	stop("'gmode' must be 0 or 1 for direct clustering")
	72	if (dtype=="simple")
	73	stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")
	74
	75	#find connected components in the graph defined by NI
	76	cc = reordering(.Call("getConnectedComponents", NI$ix))
	77	nbC = max(cc)
	78	if (nbC > 10)
	79	stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
	80	if (nbC > 1)
	81	print(paste("* WARNING:",nbC,"connex components *"))
	82	clusters = cc
	83
	84	#for each connected component...
	85	for (i in 1:nbC)
	86	{
	87	indices = (1:n)[cc == i]
	88	nc = length(indices)
	89	if (nc <= 1)
	90	next #nothing to do with such a small component
	91
	92	if (nbC > 1)
	93	{
	94	doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
	95	if (doClust == "y")
	96	K = readline(">>> into how many groups ? (int >= 2)\n")
	97	else
	98	next
	99	}
	100
	101	#0] remap NI in current connex component
	102	locNI = remapNeighbors(NI, indices)
	103
104	#1] determine similarity and distance matrices (e.g. using a random walk)
105	locDists = getDistances(dtype, locNI)
106	distances[indices,indices] = locDists
107
108	#2] cluster data inside connex component according to distance matrix
109	locClusters = getClusters(locDists, cmeth, K)
110	maxInd = max(clusters)
111	clusters[indices] = locClusters + maxInd #avoid indices overlaps
112	}
113	}
114
115	## CONVEX RELAXATION ##
116	else if (method=="convex")
117	{
118	#preliminary: remove NA's by averaging over each serie's values
119	M = replaceNAs(M)
120
121	#use NI$ix and matrix M to estimate initial parameters,
122	#and then iterate until convergence to get f + theta
123	#theta == mean observations count at each site s
124	#f == estimated variations at each site ("time-series" of T points)
125	cxpar = .Call("getVarsWithConvexOptim",
126	M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
127	f = cxpar$f #the only one we use (others can be checked by user later)
128
129	#cluster "time-series" f, using simple kmeans/HC, spect.clust,
130	#or [in a graph] KM or HC, after redefining a NI (using f only)
131
132	if (dtype=="simple")
133	{
134	#use R core functions
135	if (cmeth=="KM")
136	clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
137	else if (cmeth=="HC")
138	{
139	hct = hclust(dist(f), method="ward")
140	clusters = cutree(hct, K)
141	}
142	else if (cmeth=="spec")
143	{
144	require(kernlab)
145	clusters = as.integer(specc(f, K, kpar="automatic"))
146	}
147	}
148
149	else
150	{
151	# recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
152	#NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
153	NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
154
155	#find connected components in the graph defined by NI
156	cc = reordering(.Call("getConnectedComponents", NI$ix))
157
158	nbC = max(cc)
159	if (nbC > 10)
160	stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
161	if (nbC > 1)
162	print(paste("* WARNING:",nbC,"connex components *"))
163	clusters = cc
164
165	#for each connected component...
166	for (i in 1:nbC)
167	{
168	indices = (1:n)[cc == i]
169	nc = length(indices)
170	if (nc <= 1)
171	next #nothing to do with such a small component
172
173	if (nbC > 1)
174	{
175	doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
176	if (doClust == "y")
177	K = readline(">>> into how many groups ? (int >= 2)\n")
178	else
179	next
180	}
181
182	#0] remap NI in current connex component
183	locNI = remapNeighbors(NI, indices)
184
185	#1] determine similarity and distance matrices (e.g. using a random walk)
186	locDists = getDistances(dtype, locNI)
187	distances[indices,indices] = locDists
188
189	#2] cluster data inside connex component according to distance matrix
190	locClusters = getClusters(locDists, cmeth, K)
191	maxInd = max(clusters)
192	clusters[indices] = locClusters + maxInd #avoid indices overlaps
193	}
194	}
195	}
196
197	clusters = reordering(clusters)
198	#optional final display : map with clusters colors
199	if (disp)
200	promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)
201
202	#give back matrix M as given to the function
203	M = destandardize(std)
204
205	return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
206	}