[synclust.git] / R / main.R

#example of "not too bad" parameters
#~ k=10
#~ alpha=0.1 
#~ gmode=1 
#~ K = 5 
#~ dtype = "spath"
#~ cmeth = "HC"
#~ pcoef=??
#~ h=??
#~ eps=??
#~ maxit=??

#MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions
#from (Poisson) distribution parameters optimization, using convex relaxation.
#
#Outline:
#  1. load M if needed, standardize it and build the neighborhoods graph NI;
#  2. "direct": cluster the sites inside each connected component of NI,
#     using graph-based distances;
#     "convex": estimate variations f by convex optimization, then cluster f
#     either with plain R tools (dtype=="simple") or inside the connected
#     components of a neighborhoods graph rebuilt from f;
#  3. renumber the labels, optionally display the map, and return.
#
#Returns a list with fields:
#  M      = result of destandardize(std) (presumably the matrix as given)
#  NI     = neighborhoods used last (rebuilt from f in the convex/graph case)
#  dists  = n x n distances matrix; stays NA wherever no distance was computed
#  clusts = cluster label of each row of M
#  cxpar  = output of the convex optimization (empty list if method=="direct")
findSyncVarRegions = function(
	method, #global method: "direct" or "convex"
	M, #matrix of observations in rows, the two last columns 
	   #corresponding to geographic coordinates; 
	   #set to NULL to use our initial dataset (625 rows / 9 years);
	   #a character value is taken as a file name for read.table()
	k, #number of neighbors
	alpha, #weight parameter for intra-neighborhoods distance computations
	       #0 = take only geographic coordinates into account
	       #1 = take only observations over the years into account
	       #in-between : several levels of compromise
	       #-1 or any negative value : use a heuristic to choose alpha
	gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric)
	       #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric)
	       #NOTE: gmode==3 automatically sets k==4 (at most!)
	K, #number of clusters
	dtype, #distance type, in {"simple","spath","ectd"}.
	       #NOTE: better avoid "simple" if gmode>=2
	cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based) 
	       #or hierarchical clustering, or spectral clustering (only if gmode>=2)
	pcoef=1.0, #penalty value for convex optimization
	h=1e-3, #step in the min LL algorithm
	eps=1e-3, #threshold to stop min.LL iterations
	maxit=1e3, #maximum number of iterations in the min LL algo
	showLL=TRUE, #print trace of log-likelihood evolution
	disp=TRUE #true for interactive display (otherwise nothing gets plotted)
) {
	#get matrix M if not directly provided
	if (is.null(M))
	{
		#load into the local frame, so that the user's workspace is left untouched
		data("example", package="synclust", envir=environment())
		M = synclust_sample
	}
	if (is.character(M))
		M = as.matrix(read.table(M))

	n = nrow(M)
	m = ncol(M)

	#pretreatment for neighborhoods search: standardize M columns
	#TODO: maybe apply only on coordinates columns ?
	std = standardize(M)

	#get neighborhoods [FALSE because NOT simpleDists; see C code]
	NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)

	#optional intermediate display : map + graph (monocolor)
	if (disp)
		promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)

	#defaults, returned as they are when no clustering is carried out
	clusters = rep(1,n)
	distances = matrix(NA,nrow=n,ncol=n)
	cxpar = list()

	#Local helper shared by both methods: cluster the sites inside each connected
	#component of the graph described by 'graphNI', asking for 'nbClusters' groups.
	#Returns list(clusters=..., distances=...), both indexed like the rows of M.
	clusterInComponents = function(graphNI, nbClusters)
	{
		#find connected components in the graph defined by graphNI
		cc = reordering(.Call("getConnectedComponents", graphNI$ix))
		nbC = max(cc)
		if (nbC > 10)
			stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
		if (nbC > 1)
			print(paste("*** WARNING:",nbC,"connex components ***"))

		#start from the components labels; refined below, one component at a time
		labels = cc
		dists = matrix(NA,nrow=n,ncol=n)

		#for each connected component...
		for (i in seq_len(nbC))
		{
			indices = which(cc == i)
			nc = length(indices)
			if (nc <= 1)
				next #nothing to do with such a small component

			locK = nbClusters
			if (nbC > 1)
			{
				#several components: let the user decide what to do with each of them
				doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
				if (doClust != "y")
					next
				#readline() returns a string: convert it before using it as a count
				answer = readline(">>> into how many groups ? (int >= 2)\n")
				locK = suppressWarnings(as.integer(answer))
				if (is.na(locK) || locK < 1)
					stop(paste("invalid number of groups: '",answer,"'",sep=''))
			}

			#0] remap NI in current connex component
			locNI = remapNeighbors(graphNI, indices)

			#1] determine similarity and distance matrices (e.g. using a random walk)
			locDists = getDistances(dtype, locNI)
			dists[indices,indices] = locDists

			#2] cluster data inside connex component according to distance matrix
			locClusters = getClusters(locDists, cmeth, locK)
			maxInd = max(labels)
			labels[indices] = locClusters + maxInd #avoid indices overlaps
		}

		list(clusters=labels, distances=dists)
	}

	## DIRECT CLUSTERING ##
	if (method=="direct")
	{
		if (gmode >= 2)
			stop("'gmode' must be 0 or 1 for direct clustering")
		if (dtype=="simple")
			stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")

		res = clusterInComponents(NI, K)
		clusters = res$clusters
		distances = res$distances
	}

	## CONVEX RELAXATION ##
	else if (method=="convex")
	{
		#observations are all columns but the two last ones: at least one is needed
		if (m < 3)
			stop("'M' must contain at least one observations column before the two coordinates columns")

		#preliminary: remove NA's by averaging over each serie's values
		M = replaceNAs(M)

		#use NI$ix and matrix M to estimate initial parameters,
		#and then iterate until convergence to get f + theta
		#theta == mean observations count at each site s
		#f == estimated variations at each site ("time-series" of T points)
		#[drop=FALSE keeps a matrix even with a single observations column]
		cxpar = .Call("getVarsWithConvexOptim", 
			M[,seq_len(m-2),drop=FALSE], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
		f = cxpar$f #the only one we use (others can be checked by user later)

		#cluster "time-series" f, using simple kmeans/HC, spect.clust, 
		#or [in a graph] KM or HC, after redefining a NI (using f only)

		if (dtype=="simple")
		{
			#use R core functions
			if (cmeth=="KM")
				clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
			else if (cmeth=="HC")
			{
				#"ward.D" is the current name of the criterion formerly called "ward"
				hct = hclust(dist(f), method="ward.D")
				clusters = cutree(hct, K)
			}
			else if (cmeth=="spec")
			{
				#fail early and explicitly if the optional dependency is missing
				if (!requireNamespace("kernlab", quietly=TRUE))
					stop("package 'kernlab' is required for spectral clustering (cmeth=\"spec\")")
				clusters = as.integer(kernlab::specc(f, K, kpar="automatic"))
			}
			else
				warning(paste("unknown 'cmeth' (",cmeth,"): no clustering performed",sep=''))
		}

		else
		{
			# recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
			#NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
			NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)

			res = clusterInComponents(NI, K)
			clusters = res$clusters
			distances = res$distances
		}
	}

	else
		warning(paste("unknown 'method' (",method,"): no clustering performed",sep=''))

	clusters = reordering(clusters)
	#optional final display : map with clusters colors
	if (disp)
		promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)

	#give back matrix M as given to the function
	M = destandardize(std)

	list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)
}
Commit	Line	Data
15d1825d BA	1	#example of "not too bad" parameters
	2	#~ k=10
	3	#~ alpha=0.1
	4	#~ gmode=1
	5	#~ K = 5
	6	#~ dtype = "spath"
	7	#~ cmeth = "HC"
	8	#~ pcoef=??
	9	#~ h=??
	10	#~ eps=??
	11	#~ maxit=??
	12
	13	#MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions
	14	#from (Poisson) distribution parameters optimization, using convex relaxation.
	15	findSyncVarRegions = function(
	16	method, #global method: "direct" or "convex"
	17	M, #matrix of observations in rows, the two last columns
	18	#corresponding to geographic coordinates;
	19	#set to NULL to use our initial dataset (625 rows / 9 years)
	20	k, #number of neighbors
	21	alpha, #weight parameter for intra-neighborhoods distance computations
	22	#0 = take only geographic coordinates into account
	23	#1 = take only observations over the years into account
	24	#in-between : several levels of compromise
	25	#-1 or any negative value : use a heuristic to choose alpha
	26	gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric)
	27	#2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric)
	28	#NOTE: gmode==3 automatically sets k==4 (at most!)
	29	K, #number of clusters
	30	dtype, #distance type, in {"simple","spath","ectd"}.
	31	#NOTE: better avoid "simple" if gmode>=2
	32	cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based)
	33	#or hierarchical clustering, or spectral clustering (only if gmode>=2)
	34	pcoef=1.0, #penalty value for convex optimization
	35	h=1e-3, #step in the min LL algorithm
	36	eps=1e-3, #threshold to stop min.LL iterations
	37	maxit=1e3, #maximum number of iterations in the min LL algo
	38	showLL=TRUE, #print trace of log-likelihood evolution
	39	disp=TRUE #true for interactive display (otherwise nothing gets plotted)
	40	) {
	41	#get matrix M if not directly provided
	42	if (is.null(M))
	43	{
	44	data("example", package="synclust")
	45	M = synclust_sample
	46	}
	47	if (is.character(M))
	48	M = as.matrix(read.table(M))
	49
	50	n = nrow(M)
	51	m = ncol(M)
	52
	53	#pretreatment for neighborhoods search: standardize M columns
	54	#TODO: maybe apply only on coordinates columns ?
	55	std = standardize(M)
	56
	57	#get neighborhoods [FALSE because NOT simpleDists; see C code]
	58	NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)
	59
	60	#optional intermediate display : map + graph (monocolor)
	61	if (disp)
	62	promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)
	63
	64	clusters = rep(1,n)
65	distances = matrix(NA,nrow=n,ncol=n)
66	cxpar = list()
67
68	## DIRECT CLUSTERING ##
69	if (method=="direct")
70	{
71	if (gmode >= 2)
72	stop("'gmode' must be 0 or 1 for direct clustering")
73	if (dtype=="simple")
74	stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")
75
76	#find connected components in the graph defined by NI
77	cc = reordering(.Call("getConnectedComponents", NI$ix))
78	nbC = max(cc)
79	if (nbC > 10)
80	stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
81	if (nbC > 1)
82	print(paste("* WARNING:",nbC,"connex components *"))
83	clusters = cc
84
85	#for each connected component...
86	for (i in 1:nbC)
87	{
88	indices = (1:n)[cc == i]
89	nc = length(indices)
90	if (nc <= 1)
91	next #nothing to do with such a small component
92
93	if (nbC > 1)
94	{
95	doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
96	if (doClust == "y")
97	K = readline(">>> into how many groups ? (int >= 2)\n")
98	else
99	next
100	}
101
102	#0] remap NI in current connex component
103	locNI = remapNeighbors(NI, indices)
104
105	#1] determine similarity and distance matrices (e.g. using a random walk)
106	locDists = getDistances(dtype, locNI)
107	distances[indices,indices] = locDists
108
109	#2] cluster data inside connex component according to distance matrix
110	locClusters = getClusters(locDists, cmeth, K)
111	maxInd = max(clusters)
112	clusters[indices] = locClusters + maxInd #avoid indices overlaps
113	}
114	}
115
116	## CONVEX RELAXATION ##
117	else if (method=="convex")
118	{
119	#preliminary: remove NA's by averaging over each serie's values
120	M = replaceNAs(M)
121
122	#use NI$ix and matrix M to estimate initial parameters,
123	#and then iterate until convergence to get f + theta
124	#theta == mean observations count at each site s
125	#f == estimated variations at each site ("time-series" of T points)
126	cxpar = .Call("getVarsWithConvexOptim",
127	M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
128	f = cxpar$f #the only one we use (others can be checked by user later)
129
130	#cluster "time-series" f, using simple kmeans/HC, spect.clust,
131	#or [in a graph] KM or HC, after redefining a NI (using f only)
132
133	if (dtype=="simple")
134	{
135	#use R core functions
136	if (cmeth=="KM")
137	clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
138	else if (cmeth=="HC")
139	{
140	hct = hclust(dist(f), method="ward")
141	clusters = cutree(hct, K)
142	}
143	else if (cmeth=="spec")
144	{
145	require(kernlab)
146	clusters = as.integer(specc(f, K, kpar="automatic"))
147	}
148	}
149
150	else
151	{
152	# recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
153	#NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
154	NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
155
156	#find connected components in the graph defined by NI
157	cc = reordering(.Call("getConnectedComponents", NI$ix))
158
159	nbC = max(cc)
160	if (nbC > 10)
161	stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
162	if (nbC > 1)
163	print(paste("* WARNING:",nbC,"connex components *"))
164	clusters = cc
165
166	#for each connected component...
167	for (i in 1:nbC)
168	{
169	indices = (1:n)[cc == i]
170	nc = length(indices)
171	if (nc <= 1)
172	next #nothing to do with such a small component
173
174	if (nbC > 1)
175	{
176	doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
177	if (doClust == "y")
178	K = readline(">>> into how many groups ? (int >= 2)\n")
179	else
180	next
181	}
182
183	#0] remap NI in current connex component
184	locNI = remapNeighbors(NI, indices)
185
186	#1] determine similarity and distance matrices (e.g. using a random walk)
187	locDists = getDistances(dtype, locNI)
188	distances[indices,indices] = locDists
189
190	#2] cluster data inside connex component according to distance matrix
191	locClusters = getClusters(locDists, cmeth, K)
192	maxInd = max(clusters)
193	clusters[indices] = locClusters + maxInd #avoid indices overlaps
194	}
195	}
196	}
197
198	clusters = reordering(clusters)
199	#optional final display : map with clusters colors
200	if (disp)
201	promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)
202
203	#give back matrix M as given to the function
204	M = destandardize(std)
205
206	return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
207	}