[epclust.git] / epclust / R / clustering.R

# Cluster one full task (nb_curves / ntasks series).
#
# Repeatedly splits the current set of indices into chunks of about
# nb_series_per_chunk elements, runs computeClusters1() on every chunk in
# parallel and keeps only the indices it returns, until exactly K1 indices
# remain.
#
# indices: indices of the series handled by this task
# ncores:  number of worker processes passed to parallel::makeCluster()
#
# K1, K2, WER, nb_series_per_chunk, getCoefs and getSeries are not arguments:
# they are looked up in the enclosing scope.
#
# Returns the remaining indices when WER == "end"; otherwise the value of
# computeClusters2() applied to them.
clusteringTask = function(indices, ncores)
{
	cl = parallel::makeCluster(ncores)
	# Release the workers on every exit path, including errors
	on.exit(parallel::stopCluster(cl), add=TRUE)
	# computeClusters1 is called by name on the workers, so export it as well
	parallel::clusterExport(cl,
		varlist=c("K1","getCoefs","computeClusters1"),
		envir=environment())
	# Start from the indices given to this task
	indices_clust = indices
	#NOTE(review): the loop only ends once a pass returns exactly K1 indices;
	# confirm that nb_series_per_chunk is large enough compared to K1 for the
	# number of indices to shrink at every pass.
	repeat
	{
		nb_indices = length(indices_clust)
		nb_workers = max( 1, round( nb_indices / nb_series_per_chunk ) )
		indices_workers = lapply(seq_len(nb_workers), function(i) {
			# The last chunk takes everything that is left
			upper_bound = if (i < nb_workers) min(nb_series_per_chunk*i, nb_indices) else nb_indices
			indices_clust[(nb_series_per_chunk*(i-1)+1):upper_bound]
		})
		indices_clust = unlist( parallel::parLapply(cl, indices_workers, function(indices)
			computeClusters1(indices, getCoefs, K1)) )
		if (length(indices_clust) == K1)
			break
	}
	if (WER == "end")
		return (indices_clust)
	#WER=="mix"
	computeClusters2(indices_clust, K2, getSeries, to_file=TRUE)
}

# Apply the clustering algorithm (PAM) to the output of getCoefs() for the
# given indices, and return the indices selected as medoids.
#
# indices:  indices of the series to cluster
# getCoefs: function called with 'indices'; its result is handed to
#           cluster::pam() as observations (diss=FALSE)
# K1:       number of clusters requested from PAM
computeClusters1 = function(indices, getCoefs, K1)
{
	coefs = getCoefs(indices)
	medoids_pos = cluster::pam(coefs, K1, diss=FALSE)$id.med
	# id.med holds positions within 'coefs'; map them back to series indices
	indices[medoids_pos]
}

# Cluster a chunk of series inside one task (~max nb_series_per_chunk).
#
# indices:   indices to process; passed to computeSynchrones() when
#            WER == "mix"
# K2:        number of clusters passed to computeClusters()
# getSeries: not used in this body yet
# to_file:   if TRUE, results are meant to be written to a file (not
#            implemented yet)
#
# WER is looked up in the enclosing scope.
#
# Returns 'indices', replaced by the result of computeClusters() when
# WER == "mix".
computeClusters2 = function(indices, K2, getSeries, to_file)
{
	if (is.null(indices))
	{
		#TODO: get series from file
	}
	# Then K-means after WER...
	if (WER == "mix")
	{
		curves = computeSynchrones(indices)
		dists = computeWerDist(curves)
		#NOTE(review): computeClusters() is not defined in the visible part of
		# this file -- confirm where it comes from.
		indices = computeClusters(dists, K2, diss=TRUE)
	}
	if (to_file)
	{
		#TODO: write results to file (JUST series ; no possible ID here)
	}
	indices
}

# Compute the synchrones curves from a clustering result: one curve per
# element of 'inds', taken as the column means of what getSeries() returns for
# that element.
#NOTE(review): an earlier comment described these as the *sum* of the cluster
# elements while the code takes colMeans() -- confirm which one is intended.
#
# inds: list whose elements each carry 'indices' and 'ids' fields, both
#       forwarded to getSeries()
#
# getSeries is looked up in the enclosing scope.
#
# Returns the sapply() result: when all curves have the same length, a matrix
# with one column per element of 'inds'.
computeSynchrones = function(inds)
{
	sapply(seq_along(inds), function(i)
		colMeans(getSeries(inds[[i]]$indices, inds[[i]]$ids)))
}

# Compute the WER distance between the synchrones curves (in columns).
#
# Each column of 'curves' is centered, transformed with Rwave::cwt(), rescaled
# per scale and normalized by its largest modulus. For every pair (i,j) the
# 3-point moving averages of the cross and auto wavelet spectra are summed
# over time to build wer2, and the distance is
# sqrt(delta * nb_scales * (1 - wer2)).
#
# curves: matrix with one curve per column and one time step per row
#
# Returns a symmetric n x n matrix (n = number of curves) with a zero diagonal.
# Stops with an error if the Rwave package is not available.
computeWerDist = function(curves)
{
	# Rwave is only used through Rwave::cwt, so it does not need to be attached
	if (!requireNamespace("Rwave", quietly=TRUE))
		stop("Unable to load Rwave library")
	# Curves are read column by column below: n curves of length delta
	n <- ncol(curves)
	delta <- nrow(curves)
	#TODO: automatic tune of all these parameters ? (for other users)
	nvoice <- 4
	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(length of a curve)
	noctave <- 13
	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	s0 <- 2
	w0 <- 2*pi
	scaled <- FALSE
	s0log <- as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	totnoct <- noctave + as.integer(s0log/nvoice) + 1
	# Per-scale normalization factors (identical for every curve)
	sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)

	# (normalized) observations node with CWT
	Xcwt4 <- lapply(seq_len(n), function(i) {
		centered <- scale(ts(curves[,i]), center=TRUE, scale=scaled)
		totts.cwt <- Rwave::cwt(centered, totnoct, nvoice, w0, plot=0)
		# Keep the noctave*nvoice+1 scales starting at s0log
		ts.cwt <- totts.cwt[,s0log:(s0log+noctave*nvoice)]
		#Normalization
		sqres <- sweep(ts.cwt, MARGIN=2, sqs, '*')
		sqres / max(Mod(sqres))
	})

	# Moving average on 3 values along time (TODO: very slow! correct?!)
	#      'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	fcoefs <- rep(1/3, 3)
	smooth <- function(m) stats::filter(m, fcoefs, circular=TRUE)

	# The smoothed auto-spectrum of a curve does not depend on the curve it is
	# paired with: compute its column sums once per curve, not once per pair
	energies <- lapply(Xcwt4, function(X) colSums(smooth(Mod(X * Conj(X)))))

	Xwer_dist <- matrix(0., n, n)
	# seq_len() keeps the loop empty when there are fewer than two curves
	for (i in seq_len(max(n - 1, 0)))
	{
		for (j in (i+1):n)
		{
			#TODO: later, compute CWT here (because not enough storage space for 200k series)
			num <- smooth(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])))
			wer2 <- sum(colSums(num)^2) / sum(energies[[i]] * energies[[j]])
			# Clamp at 0 so that a marginally negative 1 - wer2 (e.g. from
			# rounding) gives a zero distance instead of NaN
			Xwer_dist[i,j] <- sqrt(delta * ncol(Xcwt4[[1]]) * max(0, 1 - wer2))
			Xwer_dist[j,i] <- Xwer_dist[i,j]
		}
	}
	diag(Xwer_dist) <- numeric(n)
	Xwer_dist
}
Commit	Line	Data
7b13d0c2	1	# Cluster one full task (nb_curves / ntasks series)
0e2dce80	2	clusteringTask = function(indices, ncores)
5c652979	3	{
0e2dce80 BA	4	cl = parallel::makeCluster(ncores)
	5	parallel::clusterExport(cl,
	6	varlist=c("K1","getCoefs"),
48108c39	7	envir=environment())
7b13d0c2 BA	8	repeat
	9	{
	10	nb_workers = max( 1, round( length(indices_clust) / nb_series_per_chunk ) )
48108c39	11	indices_workers = lapply(seq_len(nb_workers), function(i) {
7b13d0c2 BA	12	upper_bound = ifelse( i<nb_workers,
7b13d0c2 BA	13	min(nb_series_per_chunk*i,length(indices_clust)), length(indices_clust) )
48108c39 BA	14	indices_clust[(nb_series_per_chunk*(i-1)+1):upper_bound]
48108c39 BA	15	})
0e2dce80 BA	16	indices_clust = unlist( parallel::parLapply(cl, indices_workers, function(indices)
	17	computeClusters1(indices, getCoefs, K1)) )
	18	if (length(indices_clust) == K1)
7b13d0c2 BA	19	break
	20	}
	21	parallel::stopCluster(cl_clust)
0e2dce80 BA	22	if (WER == "end")
	23	return (indices_clust)
	24	#WER=="mix"
	25	computeClusters2(indices_clust, K2, getSeries, to_file=TRUE)
5c652979 BA	26	}
5c652979 BA	27
0e2dce80 BA	28	# Apply the clustering algorithm (PAM) on a coeffs or distances matrix
	29	computeClusters1 = function(indices, getCoefs, K1)
	30	indices[ cluster::pam(getCoefs(indices), K1, diss=FALSE)$id.med ]
	31
7b13d0c2	32	# Cluster a chunk of series inside one task (~max nb_series_per_chunk)
0e2dce80	33	computeClusters2 = function(indices, K2, getSeries, to_file)
5c652979	34	{
0e2dce80 BA	35	if (is.null(indices))
	36	{
	37	#get series from file
	38	}
	39	#Puis K-means après WER...
48108c39	40	if (WER=="mix" > 0)
5c652979	41	{
0e2dce80	42	curves = computeSynchrones(indices)
5c652979	43	dists = computeWerDists(curves)
0e2dce80	44	indices = computeClusters(dists, K2, diss=TRUE)
5c652979	45	}
0e2dce80 BA	46	if (to_file)
0e2dce80 BA	47	#write results to file (JUST series ; no possible ID here)
5c652979 BA	48	}
5c652979 BA	49
7b13d0c2	50	# Compute the synchrones curves (sum of clusters elements) from a clustering result
0e2dce80 BA	51	computeSynchrones = function(inds)
0e2dce80 BA	52	sapply(seq_along(inds), colMeans(getSeries(inds[[i]]$indices,inds[[i]]$ids)))
1c6f223e	53
0e2dce80	54	# Compute the WER distance between the synchrones curves (in columns)
7b13d0c2	55	computeWerDist = function(curves)
d03c0621	56	{
5c652979 BA	57	if (!require("Rwave", quietly=TRUE))
5c652979 BA	58	stop("Unable to load Rwave library")
7b13d0c2 BA	59	n <- nrow(curves)
7b13d0c2 BA	60	delta <- ncol(curves)
db6fc17d	61	#TODO: automatic tune of all these parameters ? (for other users)
d03c0621	62	nvoice <- 4
7b13d0c2	63	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
d7d55bc1 BA	64	noctave = 13
d7d55bc1 BA	65	# 4 here represent 2^5 = 32 half-hours ~ 1 day
db6fc17d BA	66	#NOTE: default scalevector == 2^(0:(noctave * nvoice) / nvoice) * s0 (?)
	67	scalevector <- 2^(4:(noctave * nvoice) / nvoice) * 2
	68	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	69	s0=2
	70	w0=2*pi
	71	scaled=FALSE
	72	s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	73	totnoct = noctave + as.integer(s0log/nvoice) + 1
	74
	75	# (normalized) observations node with CWT
	76	Xcwt4 <- lapply(seq_len(n), function(i) {
0e2dce80	77	ts <- scale(ts(curves[,i]), center=TRUE, scale=scaled)
db6fc17d BA	78	totts.cwt = Rwave::cwt(ts,totnoct,nvoice,w0,plot=0)
	79	ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
	80	#Normalization
	81	sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
	82	sqres <- sweep(ts.cwt,MARGIN=2,sqs,'*')
	83	sqres / max(Mod(sqres))
	84	})
3ccd1e39	85
db6fc17d BA	86	Xwer_dist <- matrix(0., n, n)
	87	fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	88	for (i in 1:(n-1))
1c6f223e	89	{
db6fc17d	90	for (j in (i+1):n)
d03c0621	91	{
0e2dce80	92	#TODO: later, compute CWT here (because not enough storage space for 200k series)
db6fc17d BA	93	# 'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	94	num <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	95	WX <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[i]])), fcoefs, circular=TRUE)
	96	WY <- filter(Mod(Xcwt4[[j]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	97	wer2 <- sum(colSums(num)^2) / sum( sum(colSums(WX) * colSums(WY)) )
	98	Xwer_dist[i,j] <- sqrt(delta * ncol(Xcwt4[[1]]) * (1 - wer2))
	99	Xwer_dist[j,i] <- Xwer_dist[i,j]
d03c0621	100	}
1c6f223e	101	}
d03c0621	102	diag(Xwer_dist) <- numeric(n)
c6556868	103	Xwer_dist
1c6f223e	104	}