[epclust.git] / epclust / R / clustering.R

# Cluster one full task (nb_curves / ntasks series)
#
# Stage 1: the indices are split into chunks of about nb_series_per_chunk elements,
# computeClusters1() keeps K1 medoids per chunk (chunks run in parallel), and the
# surviving indices are re-chunked until exactly K1 of them remain.
# Stage 2 (skipped when K2 == 0): computeClusters2() clusters those K1 medoids into K2.
#
# indices: identifiers of the series to cluster (subsettable vector)
# getSeries, getSeriesForSynchrones: accessors forwarded to computeClusters2()
# synchrones_file: destination handed to serialize() when to_file is TRUE
#   NOTE(review): serialize() expects an open connection, not a path -- confirm with callers
# getCoefs: function(indices) forwarded to computeClusters1()
# K1, K2: number of clusters for stage 1 / stage 2
# nb_series_per_chunk: target chunk size for stage 1
# ncores: number of workers in the parallel cluster
# to_file: if TRUE, the stage-2 medoids are serialized to synchrones_file
#
# Returns the K1 stage-1 medoid indices when K2 == 0, otherwise an empty integer vector.
# NOTE(review): when K2 > 0 and to_file is FALSE the stage-2 medoids are computed and
# then dropped (as in the previous version) -- confirm whether they should be returned.
clusteringTask = function(indices,getSeries,getSeriesForSynchrones,synchrones_file,
	getCoefs,K1,K2,nb_series_per_chunk,ncores,to_file)
{
	cl <- parallel::makeCluster(ncores)
	# 'finally' guarantees the workers are shut down even if a clustering step fails
	tryCatch(
	{
		repeat
		{
			nb_workers <- max( 1, round( length(indices) / nb_series_per_chunk ) )
			indices_workers <- lapply(seq_len(nb_workers), function(i) {
				# The last chunk absorbs the remainder left by the rounding above
				upper_bound <- if (i < nb_workers)
					min(nb_series_per_chunk*i, length(indices))
				else
					length(indices)
				indices[(nb_series_per_chunk*(i-1)+1):upper_bound]
			})
			indices <- unlist( parallel::parLapply(cl, indices_workers, function(inds)
				computeClusters1(inds, getCoefs, K1)) )
			# A single chunk yields exactly K1 medoids: stage 1 is over
			if (length(indices) == K1)
				break
		}
	},
	finally = parallel::stopCluster(cl))
	if (K2 == 0)
		return (indices)
	# Ask computeClusters2() for the in-memory result and write it from here, because
	# synchrones_file is only in scope in this function.
	medoids <- computeClusters2(indices, K2, getSeries, getSeriesForSynchrones, FALSE)
	if (to_file)
		serialize(medoids, synchrones_file)
	vector("integer",0)
}

# Run PAM on the coefficients of the given series and return the identifiers of the
# K1 medoids.
#
# indices: identifiers of the series in this chunk
# getCoefs: function(indices) returning the data handed to cluster::pam()
#   (presumably one row of coefficients per series -- confirm against callers)
# K1: number of clusters
computeClusters1 = function(indices, getCoefs, K1)
{
	pam_fit <- cluster::pam(getCoefs(indices), K1, diss=FALSE)
	# id.med holds the positions of the medoid observations; map them back to the
	# caller's identifiers.
	medoid_rows <- pam_fit$id.med
	indices[medoid_rows]
}

# Second clustering stage: build the synchrones for the given medoid indices, compute
# their pairwise WER distances and run PAM with K2 clusters on that distance matrix.
#
# indices: identifiers of the stage-1 medoids
# K2: number of clusters
# getSeries, getSeriesForSynchrones: accessors forwarded to computeSynchrones()
# to_file: if TRUE, serialize the medoids to synchrones_file and return NULL
# synchrones_file: destination handed to serialize(); required when to_file is TRUE
#   NOTE(review): serialize() expects an open connection, not a path -- confirm with callers
#
# Returns the 'medoids' component of the PAM result, or NULL when written to file.
computeClusters2 = function(indices, K2, getSeries, getSeriesForSynchrones, to_file,
	synchrones_file=NULL)
{
	# Fail before the expensive distance computation; serialize(x, NULL) would only
	# return a raw vector and write nothing.
	if (to_file && is.null(synchrones_file))
		stop("'synchrones_file' must be supplied when 'to_file' is TRUE")
	curves <- computeSynchrones(indices, getSeries, getSeriesForSynchrones)
	dists <- computeWerDist(curves)
	medoids <- cluster::pam(dists, K2, diss=TRUE)$medoids
	if (to_file)
	{
		serialize(medoids, synchrones_file)
		return (NULL)
	}
	medoids
}

# Compute the synchrones curves (sum of clusters elements) from a clustering result
#
# NOTE: unfinished. For now this only fetches getSeries(indices) and returns it
# invisibly; getSeriesForSynchrones is not used yet.
computeSynchrones = function(indices, getSeries, getSeriesForSynchrones)
{
	# Plan: the getSeries(indices) are the medoids --> initialise a zero vector for each,
	# then add to it the closest curves (from getSeriesForSynchrones)...
	# --> closest in the sense of the L2 norm?
	medoid_series <- getSeries(indices)
	#...........
	#sapply(seq_along(inds), colMeans(getSeries(inds[[i]]$indices,inds[[i]]$ids)))
	invisible(medoid_series)
}

# Compute the WER distance between the synchrones curves (in rows)
#
# curves: numeric matrix, one series per row
#
# Each row is centered, transformed with Rwave::cwt(), rescaled per scale and
# normalized by its largest modulus. For every pair (i,j) a squared coherence-like
# ratio wer2 is computed and the distance is sqrt(delta * nb_scales * (1 - wer2)),
# where delta is the series length.
#
# Returns a symmetric n x n matrix with a zero diagonal (n = nrow(curves)).
# Stops if 'curves' has no rows dimension, or if Rwave is needed but not installed.
computeWerDist = function(curves)
{
	n <- nrow(curves)
	if (is.null(n))
		stop("'curves' must be a matrix with one series per row")
	# Nothing to compare with fewer than two series (1:(n-1) would run backwards)
	if (n < 2)
		return (matrix(0., n, n))
	if (!requireNamespace("Rwave", quietly=TRUE))
		stop("Unable to load Rwave library")
	delta <- ncol(curves)
	#TODO: automatic tune of all these parameters ? (for other users)
	nvoice <- 4
	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
	noctave <- 13
	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	s0 <- 2
	w0 <- 2*pi
	scaled <- FALSE
	s0log <- as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	totnoct <- noctave + as.integer(s0log/nvoice) + 1
	# Per-scale normalization weights; identical for every series
	sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)

	# (normalized) observations node with CWT
	Xcwt4 <- lapply(seq_len(n), function(i) {
		centered <- scale(ts(curves[i,]), center=TRUE, scale=scaled)
		totts.cwt <- Rwave::cwt(centered, totnoct, nvoice, w0, plot=FALSE)
		ts.cwt <- totts.cwt[,s0log:(s0log+noctave*nvoice)]
		#Normalization
		sqres <- sweep(ts.cwt, MARGIN=2, sqs, '*')
		sqres / max(Mod(sqres))
	})

	fcoefs <- rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	#      'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	# NOTE(review): a circular moving average whose weights sum to 1 leaves column sums
	# unchanged mathematically, so this smoothing may not affect wer2 -- confirm.
	smooth <- function(m) stats::filter(m, fcoefs, circular=TRUE)

	# Smoothed power per scale depends on one series only: compute it once per series
	# instead of once per pair.
	power_sums <- lapply(Xcwt4, function(w) colSums(smooth(Mod(w * Conj(w)))))

	Xwer_dist <- matrix(0., n, n)
	norm_factor <- delta * ncol(Xcwt4[[1]])
	for (i in seq_len(n-1))
	{
		for (j in (i+1):n)
		{
			#TODO: later, compute CWT here (because not enough storage space for 200k series)
			num <- smooth(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])))
			wer2 <- sum(colSums(num)^2) / sum(power_sums[[i]] * power_sums[[j]])
			# Clamp at 0: rounding can push wer2 slightly above 1 for near-identical curves
			Xwer_dist[i,j] <- sqrt(norm_factor * max(0, 1 - wer2))
			Xwer_dist[j,i] <- Xwer_dist[i,j]
		}
	}
	diag(Xwer_dist) <- numeric(n)
	Xwer_dist
}
Commit	Line	Data
	1	# Cluster one full task (nb_curves / ntasks series)
	2	clusteringTask = function(indices,getSeries,getSeriesForSynchrones,synchrones_file,
	3	getCoefs,K1,K2,nb_series_per_chunk,ncores,to_file)
	4	{
	5	cl = parallel::makeCluster(ncores)
	6	repeat
	7	{
	8	nb_workers = max( 1, round( length(indices) / nb_series_per_chunk ) )
	9	indices_workers = lapply(seq_len(nb_workers), function(i) {
	10	upper_bound = ifelse( i<nb_workers,
	11	min(nb_series_per_chunk*i,length(indices)), length(indices) )
	12	indices[(nb_series_per_chunk*(i-1)+1):upper_bound]
	13	})
	14	indices = unlist( parallel::parLapply(cl, indices_workers, function(inds)
	15	computeClusters1(inds, getCoefs, K1)) )
	16	if (length(indices_clust) == K1)
	17	break
	18	}
	19	parallel::stopCluster(cl)
	20	if (K2 == 0)
	21	return (indices)
	22	computeClusters2(indices, K2, getSeries, getSeriesForSynchrones, to_file)
	23	vector("integer",0)
	24	}
	25
	26	# Apply the clustering algorithm (PAM) on a coeffs or distances matrix
	27	computeClusters1 = function(indices, getCoefs, K1)
	28	{
	29	coefs = getCoefs(indices)
	30	indices[ cluster::pam(coefs, K1, diss=FALSE)$id.med ]
	31	}
	32
	33	# Cluster a chunk of series inside one task (~max nb_series_per_chunk)
	34	computeClusters2 = function(indices, K2, getSeries, getSeriesForSynchrones, to_file)
	35	{
	36	curves = computeSynchrones(indices, getSeries, getSeriesForSynchrones)
	37	dists = computeWerDists(curves)
	38	medoids = cluster::pam(dists, K2, diss=TRUE)$medoids
	39	if (to_file)
	40	{
	41	serialize(medoids, synchrones_file)
	42	return (NULL)
	43	}
	44	medoids
	45	}
	46
	47	# Compute the synchrones curves (sum of clusters elements) from a clustering result
	48	computeSynchrones = function(indices, getSeries, getSeriesForSynchrones)
	49	{
	50	#les getSeries(indices) sont les medoides --> init vect nul pour chacun, puis incr avec les
	51	#courbes (getSeriesForSynchrones) les plus proches... --> au sens de la norme L2 ?
	52	series = getSeries(indices)
	53	#...........
	54	#sapply(seq_along(inds), colMeans(getSeries(inds[[i]]$indices,inds[[i]]$ids)))
	55	}
	56
	57	# Compute the WER distance between the synchrones curves (in rows)
	58	computeWerDist = function(curves)
	59	{
	60	if (!require("Rwave", quietly=TRUE))
	61	stop("Unable to load Rwave library")
	62	n <- nrow(curves)
	63	delta <- ncol(curves)
	64	#TODO: automatic tune of all these parameters ? (for other users)
	65	nvoice <- 4
	66	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
	67	noctave = 13
	68	# 4 here represent 2^5 = 32 half-hours ~ 1 day
	69	#NOTE: default scalevector == 2^(0:(noctave * nvoice) / nvoice) * s0 (?)
	70	scalevector <- 2^(4:(noctave * nvoice) / nvoice) * 2
	71	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	72	s0=2
	73	w0=2*pi
	74	scaled=FALSE
	75	s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	76	totnoct = noctave + as.integer(s0log/nvoice) + 1
	77
	78	# (normalized) observations node with CWT
	79	Xcwt4 <- lapply(seq_len(n), function(i) {
	80	ts <- scale(ts(curves[i,]), center=TRUE, scale=scaled)
	81	totts.cwt = Rwave::cwt(ts,totnoct,nvoice,w0,plot=0)
	82	ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
	83	#Normalization
	84	sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
	85	sqres <- sweep(ts.cwt,MARGIN=2,sqs,'*')
	86	sqres / max(Mod(sqres))
	87	})
	88
	89	Xwer_dist <- matrix(0., n, n)
	90	fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	91	for (i in 1:(n-1))
	92	{
	93	for (j in (i+1):n)
	94	{
	95	#TODO: later, compute CWT here (because not enough storage space for 200k series)
	96	# 'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	97	num <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	98	WX <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[i]])), fcoefs, circular=TRUE)
	99	WY <- filter(Mod(Xcwt4[[j]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	100	wer2 <- sum(colSums(num)^2) / sum( sum(colSums(WX) * colSums(WY)) )
	101	Xwer_dist[i,j] <- sqrt(delta * ncol(Xcwt4[[1]]) * (1 - wer2))
	102	Xwer_dist[j,i] <- Xwer_dist[i,j]
	103	}
	104	}
	105	diag(Xwer_dist) <- numeric(n)
	106	Xwer_dist
	107	}