[synclust.git] / R / main.R

#example of "not too bad" parameters
#~ k=10
#~ alpha=0.1 
#~ gmode=1 
#~ K = 5 
#~ dtype = "spath"
#~ cmeth = "HC"
#~ pcoef=??
#~ h=??
#~ eps=??
#~ maxit=??

#MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions
#from (Poisson) distribution parameters optimization, using convex relaxation.
#
#Steps: 1) load M if needed, 2) standardize it and build the neighborhoods
#graph, 3) cluster either directly in that graph ("direct") or on the
#variations f estimated by the convex optimization ("convex").
#
#Returns a list with components
#  "M"      : result of destandardize(std) (meant to be M as given by the caller)
#  "NI"     : neighborhoods; recomputed from f when method=="convex" and
#             dtype!="simple"
#  "dists"  : n x n matrix filled block-wise with the distances computed inside
#             each clustered connected component (NA everywhere else)
#  "clusts" : clusters labels, after a final call to reordering()
#  "cxpar"  : output of the convex optimization (empty list if not "convex")
findSyncVarRegions = function(
	method, #global method: "direct" or "convex"
	M, #matrix of observations in rows, the two last columns
	   #corresponding to geographic coordinates;
	   #set to NULL to use our initial dataset (625 rows / 9 years);
	   #a character value is read as a file path through read.table()
	k, #number of neighbors
	alpha, #weight parameter for intra-neighborhoods distance computations
	       #0 = take only geographic coordinates into account
	       #1 = take only observations over the years into account
	       #in-between : several levels of compromise
	       #-1 or any negative value : use a heuristic to choose alpha
	gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric)
	       #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric)
	       #NOTE: gmode==3 automatically sets k==4 (at most!)
	K, #number of clusters
	dtype, #distance type, in {"simple","spath","ectd"}.
	       #NOTE: better avoid "simple" if gmode>=2
	cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based)
	       #or hierarchical clustering, or spectral clustering (only if gmode>=2)
	pcoef=1.0, #penalty value for convex optimization
	h=1e-3, #step in the min LL algorithm
	eps=1e-3, #threshold to stop min.LL iterations
	maxit=1e3, #maximum number of iterations in the min LL algo
	showLL=TRUE, #print trace of log-likelihood evolution
	disp=TRUE #true for interactive display (otherwise nothing gets plotted)
) {
	#get matrix M if not directly provided
	if (is.null(M))
	{
		data("example", package="synclust")
		M = synclust_sample
	}
	if (is.character(M))
		M = as.matrix(read.table(M))

	n = nrow(M)
	m = ncol(M)

	#pretreatment for neighborhoods search: standardize M columns
	#TODO: maybe apply only on coordinates columns ?
	std = standardize(M)

	#get neighborhoods [FALSE because NOT simpleDists; see C code]
	NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)

	#optional intermediate display : map + graph (monocolor)
	if (disp)
		promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)

	#default outputs, kept as is when no clustering is performed
	clusters = rep(1,n)
	distances = matrix(NA,nrow=n,ncol=n)
	cxpar = list()

	## DIRECT CLUSTERING ##
	if (method=="direct")
	{
		if (gmode >= 2)
			stop("'gmode' must be 0 or 1 for direct clustering")
		if (dtype=="simple")
			stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")

		graphClust = .clusterGraphComponents(NI, n, dtype, cmeth, K)
		clusters = graphClust$clusters
		distances = graphClust$distances
	}

	## CONVEX RELAXATION ##
	else if (method=="convex")
	{
		#preliminary: remove NA's by averaging over each serie's values
		M = replaceNAs(M)

		#use NI$ix and matrix M to estimate initial parameters,
		#and then iterate until convergence to get f + theta
		#theta == mean observations count at each site s
		#f == estimated variations at each site ("time-series" of T points)
		cxpar = .Call("getVarsWithConvexOptim",
			M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
		f = cxpar$f #the only one we use (others can be checked by user later)

		#cluster "time-series" f, using simple kmeans/HC, spect.clust,
		#or [in a graph] KM or HC, after redefining a NI (using f only)

		if (dtype=="simple")
		{
			#use R core functions
			if (cmeth=="KM")
				clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
			else if (cmeth=="HC")
			{
				#"ward.D" is the current name of the criterion formerly called
				#"ward": same result, without the renaming message
				hct = hclust(dist(f), method="ward.D")
				clusters = cutree(hct, K)
			}
			else if (cmeth=="spec")
			{
				#fail early with a clear message (and without attaching the
				#package) instead of a late "could not find function" error
				if (!requireNamespace("kernlab", quietly=TRUE))
					stop("package 'kernlab' is required for cmeth==\"spec\"")
				clusters = as.integer(kernlab::specc(f, K, kpar="automatic"))
			}
			else
				warning("unknown 'cmeth' (expected \"KM\", \"HC\" or \"spec\"): no clustering performed")
		}

		else
		{
			# recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
			#NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
			NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)

			graphClust = .clusterGraphComponents(NI, n, dtype, cmeth, K)
			clusters = graphClust$clusters
			distances = graphClust$distances
		}
	}

	#anything else used to be silently ignored (all sites in one cluster);
	#the returned value is unchanged, but the user is now told about it
	else
		warning("unknown 'method' (expected \"direct\" or \"convex\"): no clustering performed")

	clusters = reordering(clusters)
	#optional final display : map with clusters colors
	if (disp)
		promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)

	#give back matrix M as given to the function
	M = destandardize(std)

	return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
}

#Private helper: cluster separately each connected component of the graph
#described by NI (shared by the "direct" and "convex" methods).
#  NI    : neighborhoods, as returned by the "getNeighbors" routine
#  n     : total number of sites (length of the returned labels vector)
#  dtype : distance type, forwarded to getDistances()
#  cmeth : clustering method, forwarded to getClusters()
#  K     : number of clusters, used when the graph is connected; otherwise the
#          user is asked, for each component, whether to cluster it and into
#          how many groups
#Returns list(clusters=labels vector, distances=n x n matrix with NA outside
#the clustered components).
.clusterGraphComponents = function(NI, n, dtype, cmeth, K)
{
	distances = matrix(NA,nrow=n,ncol=n)

	#find connected components in the graph defined by NI
	cc = reordering(.Call("getConnectedComponents", NI$ix))
	nbC = max(cc)
	if (nbC > 10)
		stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
	if (nbC > 1)
		print(paste("*** WARNING:",nbC,"connex components ***"))

	#start from the components labels; clustered components get new labels
	clusters = cc

	#for each connected component...
	for (i in seq_len(nbC))
	{
		indices = which(cc == i)
		nc = length(indices)
		if (nc <= 1)
			next #nothing to do with such a small component

		nbGroups = K
		if (nbC > 1)
		{
			doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
			if (doClust != "y")
				next

			#readline() returns a string: convert it to a number, and ask
			#again until the answer is usable
			repeat
			{
				answer = readline(">>> into how many groups ? (int >= 2)\n")
				nbGroups = suppressWarnings(as.integer(answer))
				if (!is.na(nbGroups) && nbGroups >= 2)
					break
				if (!interactive())
					stop("invalid number of groups (expected an integer >= 2)")
				print("*** invalid answer: expected an integer >= 2 ***")
			}
		}

		#0] remap NI in current connex component
		locNI = remapNeighbors(NI, indices)

		#1] determine similarity and distance matrices (e.g. using a random walk)
		locDists = getDistances(dtype, locNI)
		distances[indices,indices] = locDists

		#2] cluster data inside connex component according to distance matrix
		locClusters = getClusters(locDists, cmeth, nbGroups)
		maxInd = max(clusters)
		clusters[indices] = locClusters + maxInd #avoid indices overlaps
	}

	list(clusters=clusters, distances=distances)
}
Commit	Line	Data
	1	#example of "not too bad" parameters
	2	#~ k=10
	3	#~ alpha=0.1
	4	#~ gmode=1
	5	#~ K = 5
	6	#~ dtype = "spath"
	7	#~ cmeth = "HC"
	8	#~ pcoef=??
	9	#~ h=??
	10	#~ eps=??
	11	#~ maxit=??
	12
	13	#MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions
	14	#from (Poisson) distribution parameters optimization, using convex relaxation.
	15	findSyncVarRegions = function(
	16	method, #global method: "direct" or "convex"
	17	M, #matrix of observations in rows, the two last columns
	18	#corresponding to geographic coordinates;
	19	#set to NULL to use our initial dataset (625 rows / 9 years)
	20	k, #number of neighbors
	21	alpha, #weight parameter for intra-neighborhoods distance computations
	22	#0 = take only geographic coordinates into account
	23	#1 = take only observations over the years into account
	24	#in-between : several levels of compromise
	25	#-1 or any negative value : use a heuristic to choose alpha
	26	gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric)
	27	#2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric)
	28	#NOTE: gmode==3 automatically sets k==4 (at most!)
	29	K, #number of clusters
	30	dtype, #distance type, in {"simple","spath","ectd"}.
	31	#NOTE: better avoid "simple" if gmode>=2
	32	cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based)
	33	#or hierarchical clustering, or spectral clustering (only if gmode>=2)
	34	pcoef=1.0, #penalty value for convex optimization
	35	h=1e-3, #step in the min LL algorithm
	36	eps=1e-3, #threshold to stop min.LL iterations
	37	maxit=1e3, #maximum number of iterations in the min LL algo
	38	showLL=TRUE, #print trace of log-likelihood evolution
	39	disp=TRUE #true for interactive display (otherwise nothing gets plotted)
	40	) {
	41	#get matrix M if not directly provided
	42	if (is.null(M))
	43	{
	44	data("example", package="synclust")
	45	M = synclust_sample
	46	}
	47	if (is.character(M))
	48	M = as.matrix(read.table(M))
	49
	50	n = nrow(M)
	51	m = ncol(M)
	52
	53	#pretreatment for neighborhoods search: standardize M columns
	54	#TODO: maybe apply only on coordinates columns ?
	55	std = standardize(M)
	56
	57	#get neighborhoods [FALSE because NOT simpleDists; see C code]
	58	NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)
	59
	60	#optional intermediate display : map + graph (monocolor)
	61	if (disp)
	62	promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)
	63
	64	clusters = rep(1,n)
	65	distances = matrix(NA,nrow=n,ncol=n)
	66	cxpar = list()
	67
	68	## DIRECT CLUSTERING ##
	69	if (method=="direct")
	70	{
	71	if (gmode >= 2)
	72	stop("'gmode' must be 0 or 1 for direct clustering")
	73	if (dtype=="simple")
	74	stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")
	75
	76	#find connected components in the graph defined by NI
	77	cc = reordering(.Call("getConnectedComponents", NI$ix))
	78	nbC = max(cc)
	79	if (nbC > 10)
	80	stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
	81	if (nbC > 1)
	82	print(paste("* WARNING:",nbC,"connex components *"))
	83	clusters = cc
	84
	85	#for each connected component...
	86	for (i in 1:nbC)
	87	{
	88	indices = (1:n)[cc == i]
	89	nc = length(indices)
	90	if (nc <= 1)
	91	next #nothing to do with such a small component
	92
	93	if (nbC > 1)
	94	{
	95	doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
	96	if (doClust == "y")
	97	K = readline(">>> into how many groups ? (int >= 2)\n")
	98	else
	99	next
	100	}
	101
	102	#0] remap NI in current connex component
	103	locNI = remapNeighbors(NI, indices)
	104
	105	#1] determine similarity and distance matrices (e.g. using a random walk)
	106	locDists = getDistances(dtype, locNI)
	107	distances[indices,indices] = locDists
	108
	109	#2] cluster data inside connex component according to distance matrix
	110	locClusters = getClusters(locDists, cmeth, K)
	111	maxInd = max(clusters)
	112	clusters[indices] = locClusters + maxInd #avoid indices overlaps
	113	}
	114	}
	115
	116	## CONVEX RELAXATION ##
	117	else if (method=="convex")
	118	{
	119	#preliminary: remove NA's by averaging over each serie's values
	120	M = replaceNAs(M)
	121
	122	#use NI$ix and matrix M to estimate initial parameters,
	123	#and then iterate until convergence to get f + theta
	124	#theta == mean observations count at each site s
	125	#f == estimated variations at each site ("time-series" of T points)
	126	cxpar = .Call("getVarsWithConvexOptim",
	127	M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
	128	f = cxpar$f #the only one we use (others can be checked by user later)
	129
	130	#cluster "time-series" f, using simple kmeans/HC, spect.clust,
	131	#or [in a graph] KM or HC, after redefining a NI (using f only)
	132
	133	if (dtype=="simple")
	134	{
	135	#use R core functions
	136	if (cmeth=="KM")
	137	clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
	138	else if (cmeth=="HC")
	139	{
	140	hct = hclust(dist(f), method="ward")
	141	clusters = cutree(hct, K)
	142	}
	143	else if (cmeth=="spec")
	144	{
	145	require(kernlab)
	146	clusters = as.integer(specc(f, K, kpar="automatic"))
	147	}
	148	}
	149
	150	else
	151	{
	152	# recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
	153	#NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
	154	NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
	155
	156	#find connected components in the graph defined by NI
	157	cc = reordering(.Call("getConnectedComponents", NI$ix))
	158
	159	nbC = max(cc)
	160	if (nbC > 10)
	161	stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
	162	if (nbC > 1)
	163	print(paste("* WARNING:",nbC,"connex components *"))
	164	clusters = cc
	165
	166	#for each connected component...
	167	for (i in 1:nbC)
	168	{
	169	indices = (1:n)[cc == i]
	170	nc = length(indices)
	171	if (nc <= 1)
	172	next #nothing to do with such a small component
	173
	174	if (nbC > 1)
	175	{
	176	doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
	177	if (doClust == "y")
	178	K = readline(">>> into how many groups ? (int >= 2)\n")
	179	else
	180	next
	181	}
	182
	183	#0] remap NI in current connex component
	184	locNI = remapNeighbors(NI, indices)
	185
	186	#1] determine similarity and distance matrices (e.g. using a random walk)
	187	locDists = getDistances(dtype, locNI)
	188	distances[indices,indices] = locDists
	189
	190	#2] cluster data inside connex component according to distance matrix
	191	locClusters = getClusters(locDists, cmeth, K)
	192	maxInd = max(clusters)
	193	clusters[indices] = locClusters + maxInd #avoid indices overlaps
	194	}
	195	}
	196	}
	197
	198	clusters = reordering(clusters)
	199	#optional final display : map with clusters colors
	200	if (disp)
	201	promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)
	202
	203	#give back matrix M as given to the function
	204	M = destandardize(std)
	205
	206	return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
	207	}