[epclust.git] / epclust / R / clustering.R

# Cluster one full task (nb_curves / ntasks series)
#
# Stage 1: `indices` is repeatedly split into chunks of about
# `nb_series_per_chunk` elements; computeClusters1() is run on every chunk in
# parallel and only the returned medoid indices are kept, until exactly K1
# indices remain.
# Stage 2 (only when K2 != 0): computeClusters2() is run on the K1 indices.
#
# indices: identifiers of the series to cluster
# getSeries, getSeriesForSynchrones: accessors forwarded to computeClusters2()
# synchrones_file: output target forwarded to computeClusters2()
# getCoefs: function(indices) returning the data given to PAM in stage 1
# K1, K2: number of clusters in stage 1 / stage 2 (K2 == 0 skips stage 2)
# nb_series_per_chunk: target chunk size for stage 1 (also forwarded to stage 2)
# ncores: number of workers of the parallel cluster
# to_file, ftype: forwarded unchanged to computeClusters2()
#
# Returns the K1 medoid indices when K2 == 0, otherwise an empty integer vector.
clusteringTask = function(indices,getSeries,getSeriesForSynchrones,synchrones_file,
	getCoefs,K1,K2,nb_series_per_chunk,ncores,to_file,ftype)
{
	cl = parallel::makeCluster(ncores)
	# 'finally' guarantees that the workers are released even when a chunk fails
	indices = tryCatch(
	{
		repeat
		{
			nb_total = length(indices)
			nb_workers = max( 1, round( nb_total / nb_series_per_chunk ) )
			indices_workers = lapply(seq_len(nb_workers), function(i) {
				# The last chunk absorbs the remainder left by the rounding above
				upper_bound = if (i < nb_workers)
					min(nb_series_per_chunk*i, nb_total)
				else
					nb_total
				indices[(nb_series_per_chunk*(i-1)+1):upper_bound]
			})
			# Pass computeClusters1 itself (not a closure over this frame), so that
			# only the function and its two extra arguments are shipped to the workers
			indices = unlist( parallel::parLapply(cl, indices_workers, computeClusters1,
				getCoefs=getCoefs, K1=K1) )
			if (length(indices) == K1)
				break
		}
		indices
	}, finally = parallel::stopCluster(cl))
	if (K2 == 0)
		return (indices)
	# NOTE(review): the value returned by computeClusters2() is discarded here,
	# so with to_file == FALSE the stage-2 medoids are lost -- confirm intent.
	computeClusters2(indices, K2, getSeries, getSeriesForSynchrones, to_file,
		nb_series_per_chunk, ftype, synchrones_file)
	vector("integer",0)
}

# Apply the clustering algorithm (PAM) on a coeffs or distances matrix
#
# indices: identifiers of the series in this chunk
# getCoefs: function(indices) returning the data matrix given to PAM
# K1: number of clusters
#
# Returns the elements of `indices` located at the medoid positions (`id.med`)
# reported by cluster::pam().
computeClusters1 = function(indices, getCoefs, K1)
{
	coefs = getCoefs(indices)
	indices[ cluster::pam(coefs, K1, diss=FALSE)$id.med ]
}

# Cluster a chunk of series inside one task (~max nb_series_per_chunk)
#
# Builds the synchrones of `indices` with computeSynchrones(), computes their
# pairwise distances with computeWerDist() and runs PAM with K2 clusters on the
# resulting dissimilarity matrix.
#
# synchrones_file: output target, required when to_file is TRUE. It is a
#   trailing argument with a default so that existing 7-argument calls keep
#   working.
#
# Returns NULL after writing when to_file is TRUE, otherwise the `medoids`
# component of the PAM result.
computeClusters2 = function(indices, K2, getSeries, getSeriesForSynchrones, to_file,
	nb_series_per_chunk, ftype, synchrones_file=NULL)
{
	curves = computeSynchrones(indices, getSeries, getSeriesForSynchrones, nb_series_per_chunk)
	dists = computeWerDist(curves)
	# NOTE(review): per the cluster::pam documentation, with diss=TRUE `$medoids`
	# holds observation numbers/labels rather than curves -- confirm that this is
	# what should be written / returned.
	medoids = cluster::pam(dists, K2, diss=TRUE)$medoids
	if (to_file)
	{
		if (is.null(synchrones_file))
			stop("'synchrones_file' must be provided when to_file is TRUE")
		# NOTE(review): this argument list does not match base::serialize();
		# presumably a project-level serialize() is expected here -- confirm.
		serialize(medoids, synchrones_file, ftype, nb_series_per_chunk)
		return (NULL)
	}
	medoids
}

# Compute the synchrones curves (sum of clusters elements) from a clustering result
#
# The rows of getSeries(indices) are the medoids. One accumulator row is
# initialised to zero per medoid; every series delivered by
# getSeriesForSynchrones() is then added to the accumulator of its closest
# medoid (squared L2 distance), and each accumulator is finally divided by the
# number of series it received.
#
# indices: identifiers of the medoids
# getSeries: function(indices) returning the medoids, one per row
# getSeriesForSynchrones: function(indices) returning a chunk of series (one
#   per row), or NULL once the requested indices are past the end of the data
# nb_series_per_chunk: number of series requested per call
#
# Returns a matrix with one row per medoid. A medoid that attracted no series
# has a zero count, hence a row of NaN (0/0).
computeSynchrones = function(indices, getSeries, getSeriesForSynchrones, nb_series_per_chunk)
{
	medoids = getSeries(indices)
	K = nrow(medoids)
	synchrones = matrix(0, nrow=K, ncol=ncol(medoids))
	counts = rep(0,K)
	# Position of the first series of the current chunk
	chunk_start = 1
	repeat
	{
		series = getSeriesForSynchrones((chunk_start-1)+seq_len(nb_series_per_chunk))
		if (is.null(series))
			break
		# A single series delivered as a plain vector is handled as a one-row chunk
		if (is.null(dim(series)))
			series = matrix(series, nrow=1)
		# An empty chunk also marks the end of the data (avoids looping forever)
		if (nrow(series) == 0)
			break
		for (i in seq_len(nrow(series)))
		{
			# Index of the medoid closest to the i-th series of this chunk
			closest = which.min( rowSums( sweep(medoids, 2, series[i,], '-')^2 ) )
			synchrones[closest,] = synchrones[closest,] + series[i,]
			counts[closest] = counts[closest] + 1
		}
		chunk_start = chunk_start + nb_series_per_chunk
	}
	#NOTE: odds for some clusters to be empty? (when series already come from stage 2)
	sweep(synchrones, 1, counts, '/')
}

# Compute the WER distance between the synchrones curves (in rows)
#
# Every row of `curves` is centred, transformed with Rwave::cwt(), restricted
# to noctave*nvoice+1 scales and normalised. For each pair (i,j) the quantity
# wer2 is obtained from the time-smoothed cross- and auto-spectra, and the
# distance is sqrt(delta * nb_scales * (1 - wer2)), delta being ncol(curves).
#
# curves: matrix with one curve per row
#
# Returns a symmetric nrow(curves) x nrow(curves) matrix with a zero diagonal.
# Stops with an error when the Rwave package is not available.
computeWerDist = function(curves)
{
	n <- nrow(curves)
	# With fewer than two curves there is no pair to compare (and 1:(n-1) would
	# iterate over c(1,0)): the result only consists of the zero diagonal
	if (n < 2)
		return (matrix(0., n, n))
	# Rwave::cwt is always called fully qualified: no need to attach the package
	if (!requireNamespace("Rwave", quietly=TRUE))
		stop("Unable to load Rwave library")
	delta <- ncol(curves)
	#TODO: automatic tune of all these parameters ? (for other users)
	nvoice   <- 4
	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
	noctave = 13
	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	s0=2
	w0=2*pi
	scaled=FALSE
	s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	totnoct = noctave + as.integer(s0log/nvoice) + 1

	# (normalized) observations node with CWT
	Xcwt4 <- lapply(seq_len(n), function(i) {
		centered <- scale(ts(curves[i,]), center=TRUE, scale=scaled)
		totts.cwt = Rwave::cwt(centered,totnoct,nvoice,w0,plot=0)
		ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
		#Normalization
		sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
		sqres <- sweep(ts.cwt,MARGIN=2,sqs,'*')
		sqres / max(Mod(sqres))
	})

	fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	#TODO: 'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	# stats::filter is named explicitly because other packages (e.g. dplyr) mask filter()
	smooth_sums <- function(spectrum)
		colSums(stats::filter(spectrum, fcoefs, circular=TRUE))

	# The auto-spectrum of a curve does not depend on the curve it is compared
	# with: compute it once per curve instead of once per pair
	auto_sums <- lapply(Xcwt4, function(X) smooth_sums(Mod(X * Conj(X))))
	nb_scales <- ncol(Xcwt4[[1]])

	Xwer_dist <- matrix(0., n, n)
	for (i in seq_len(n-1))
	{
		for (j in (i+1):n)
		{
			#TODO: later, compute CWT here (because not enough storage space for 200k series)
			num <- smooth_sums(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])))
			wer2 <- sum(num^2) / sum(auto_sums[[i]] * auto_sums[[j]])
			# Rounding may push wer2 slightly above 1: clamp to keep sqrt() defined
			Xwer_dist[i,j] <- sqrt(delta * nb_scales * max(0, 1 - wer2))
			Xwer_dist[j,i] <- Xwer_dist[i,j]
		}
	}
	diag(Xwer_dist) <- numeric(n)
	Xwer_dist
}
Commit	Line	Data
7b13d0c2	1	# Cluster one full task (nb_curves / ntasks series)
e205f218	2	clusteringTask = function(indices,getSeries,getSeriesForSynchrones,synchrones_file,
3eef8d3d	3	getCoefs,K1,K2,nb_series_per_chunk,ncores,to_file,ftype)
5c652979	4	{
0e2dce80	5	cl = parallel::makeCluster(ncores)
7b13d0c2 BA	6	repeat
7b13d0c2 BA	7	{
e205f218	8	nb_workers = max( 1, round( length(indices) / nb_series_per_chunk ) )
48108c39	9	indices_workers = lapply(seq_len(nb_workers), function(i) {
7b13d0c2	10	upper_bound = ifelse( i<nb_workers,
e205f218 BA	11	min(nb_series_per_chunk*i,length(indices)), length(indices) )
e205f218 BA	12	indices[(nb_series_per_chunk*(i-1)+1):upper_bound]
48108c39	13	})
e205f218 BA	14	indices = unlist( parallel::parLapply(cl, indices_workers, function(inds)
e205f218 BA	15	computeClusters1(inds, getCoefs, K1)) )
0e2dce80	16	if (length(indices_clust) == K1)
7b13d0c2 BA	17	break
7b13d0c2 BA	18	}
e205f218 BA	19	parallel::stopCluster(cl)
	20	if (K2 == 0)
	21	return (indices)
3eef8d3d BA	22	computeClusters2(indices, K2, getSeries, getSeriesForSynchrones, to_file,
3eef8d3d BA	23	nb_series_per_chunk,ftype)
e205f218	24	vector("integer",0)
5c652979 BA	25	}
5c652979 BA	26
0e2dce80 BA	27	# Apply the clustering algorithm (PAM) on a coeffs or distances matrix
0e2dce80 BA	28	computeClusters1 = function(indices, getCoefs, K1)
e205f218 BA	29	{
	30	coefs = getCoefs(indices)
	31	indices[ cluster::pam(coefs, K1, diss=FALSE)$id.med ]
	32	}
0e2dce80	33
7b13d0c2	34	# Cluster a chunk of series inside one task (~max nb_series_per_chunk)
3eef8d3d BA	35	computeClusters2 = function(indices, K2, getSeries, getSeriesForSynchrones, to_file,
3eef8d3d BA	36	nb_series_per_chunk, ftype)
5c652979	37	{
3eef8d3d	38	curves = computeSynchrones(indices, getSeries, getSeriesForSynchrones, nb_series_per_chunk)
e205f218 BA	39	dists = computeWerDists(curves)
	40	medoids = cluster::pam(dists, K2, diss=TRUE)$medoids
	41	if (to_file)
5c652979	42	{
3eef8d3d	43	serialize(medoids, synchrones_file, ftype, nb_series_per_chunk)
e205f218	44	return (NULL)
5c652979	45	}
e205f218	46	medoids
5c652979 BA	47	}
5c652979 BA	48
7b13d0c2	49	# Compute the synchrones curves (sum of clusters elements) from a clustering result
3eef8d3d	50	computeSynchrones = function(indices, getSeries, getSeriesForSynchrones, nb_series_per_chunk)
e205f218 BA	51	{
	52	#les getSeries(indices) sont les medoides --> init vect nul pour chacun, puis incr avec les
	53	#courbes (getSeriesForSynchrones) les plus proches... --> au sens de la norme L2 ?
3eef8d3d BA	54	medoids = getSeries(indices)
	55	K = nrow(medoids)
	56	synchrones = matrix(0, nrow=K, ncol=ncol(medoids))
	57	counts = rep(0,K)
	58	index = 1
	59	repeat
	60	{
	61	series = getSeriesForSynchrones((index-1)+seq_len(nb_series_per_chunk))
	62	if (is.null(series))
	63	break
	64	#get medoids indices for this chunk of series
	65	index = which.min( rowSums( sweep(medoids, 2, series[i,], '-')^2 ) )
	66	synchrones[index,] = synchrones[index,] + series[i,]
	67	counts[index] = counts[index] + 1
	68	}
	69	#NOTE: odds for some clusters to be empty? (when series already come from stage 2)
	70	synchrones = sweep(synchrones, 1, counts, '/')
e205f218	71	}
1c6f223e	72
e205f218	73	# Compute the WER distance between the synchrones curves (in rows)
7b13d0c2	74	computeWerDist = function(curves)
d03c0621	75	{
5c652979 BA	76	if (!require("Rwave", quietly=TRUE))
5c652979 BA	77	stop("Unable to load Rwave library")
7b13d0c2 BA	78	n <- nrow(curves)
7b13d0c2 BA	79	delta <- ncol(curves)
db6fc17d	80	#TODO: automatic tune of all these parameters ? (for other users)
d03c0621	81	nvoice <- 4
7b13d0c2	82	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
d7d55bc1 BA	83	noctave = 13
d7d55bc1 BA	84	# 4 here represent 2^5 = 32 half-hours ~ 1 day
db6fc17d BA	85	#NOTE: default scalevector == 2^(0:(noctave * nvoice) / nvoice) * s0 (?)
	86	scalevector <- 2^(4:(noctave * nvoice) / nvoice) * 2
	87	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	88	s0=2
	89	w0=2*pi
	90	scaled=FALSE
	91	s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	92	totnoct = noctave + as.integer(s0log/nvoice) + 1
	93
	94	# (normalized) observations node with CWT
	95	Xcwt4 <- lapply(seq_len(n), function(i) {
e205f218	96	ts <- scale(ts(curves[i,]), center=TRUE, scale=scaled)
db6fc17d BA	97	totts.cwt = Rwave::cwt(ts,totnoct,nvoice,w0,plot=0)
	98	ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
	99	#Normalization
	100	sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
	101	sqres <- sweep(ts.cwt,MARGIN=2,sqs,'*')
	102	sqres / max(Mod(sqres))
	103	})
3ccd1e39	104
db6fc17d BA	105	Xwer_dist <- matrix(0., n, n)
	106	fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	107	for (i in 1:(n-1))
1c6f223e	108	{
db6fc17d	109	for (j in (i+1):n)
d03c0621	110	{
0e2dce80	111	#TODO: later, compute CWT here (because not enough storage space for 200k series)
db6fc17d BA	112	# 'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	113	num <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	114	WX <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[i]])), fcoefs, circular=TRUE)
	115	WY <- filter(Mod(Xcwt4[[j]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	116	wer2 <- sum(colSums(num)^2) / sum( sum(colSums(WX) * colSums(WY)) )
	117	Xwer_dist[i,j] <- sqrt(delta * ncol(Xcwt4[[1]]) * (1 - wer2))
	118	Xwer_dist[j,i] <- Xwer_dist[i,j]
d03c0621	119	}
1c6f223e	120	}
d03c0621	121	diag(Xwer_dist) <- numeric(n)
c6556868	122	Xwer_dist
1c6f223e	123	}