[epclust.git] / epclust / R / clustering.R

# Cluster one full task (nb_curves / ntasks series); only step 1.
#
# The current set of series indices is repeatedly split into chunks, PAM is run
# on every chunk in parallel and only the medoids are kept, until exactly K1
# indices remain.
#
# indices: vector of series indices to cluster
# getCoefs: function taking a vector of indices and returning the matrix handed
#   to PAM (presumably one row of coefficients per index, in the same order;
#   TODO confirm against callers)
# K1: number of medoids kept for each chunk (and returned at the end)
# nb_series_per_chunk: target number of series per chunk
# ncores: number of workers started with parallel::makeCluster
#
# Returns the K1 elements of 'indices' selected as medoids.
# Stops with an error if a pass does not reduce the number of indices, since
# the loop could then never reach K1.
clusteringTask = function(indices, getCoefs, K1, nb_series_per_chunk, ncores)
{
	cl = parallel::makeCluster(ncores)
	# Release the workers even when a clustering step fails
	on.exit(parallel::stopCluster(cl), add=TRUE)
	repeat
	{
		nb_indices = length(indices)
		nb_workers = max( 1, round( nb_indices / nb_series_per_chunk ) )
		indices_workers = lapply(seq_len(nb_workers), function(i) {
			# The last chunk absorbs whatever the rounding above left over
			upper_bound = if (i < nb_workers) min(nb_series_per_chunk*i, nb_indices) else nb_indices
			indices[(nb_series_per_chunk*(i-1)+1):upper_bound]
		})
		# computeClusters1 returns positions inside the chunk: map them back to
		# the series indices of that chunk.
		# NOTE(review): the workers must be able to resolve computeClusters1 and
		# the 'cluster' package (e.g. code installed as a package) - confirm.
		medoids = unlist( parallel::parLapply(cl, indices_workers, function(inds)
			inds[ computeClusters1(getCoefs(inds), K1) ]) )
		if (length(medoids) == K1)
		{
			indices = medoids
			break
		}
		# Without a strict decrease the next pass would use at least as many
		# chunks, so the loop would never end
		if (length(medoids) >= nb_indices)
			stop("clusteringTask cannot converge: the number of medoids does not decrease; K1 must be smaller than nb_series_per_chunk")
		indices = medoids
	}
	indices #medoids
}

# Apply the clustering algorithm (PAM) on a coefficients matrix (diss=FALSE).
# Returns the positions (row numbers in 'coefs') of the K1 medoids, as given by
# the 'id.med' component of cluster::pam.
computeClusters1 = function(coefs, K1)
	cluster::pam(coefs, K1, diss=FALSE)$id.med

# Cluster a chunk of series inside one task (~max nb_series_per_chunk)
#
# medoids: matrix of medoid curves (one per row), passed to computeSynchrones
# K2: number of clusters requested from PAM
# getRefSeries, nb_series_per_chunk: forwarded to computeSynchrones
#
# Returns the 'medoids' component of cluster::pam run on the WER distance
# matrix of the synchrones (diss=TRUE). Per the pam documentation this is a
# vector of observation numbers or labels when a dissimilarity is given, so
# presumably row positions in the synchrones matrix - verify against callers.
computeClusters2 = function(medoids, K2, getRefSeries, nb_series_per_chunk)
{
	synchrones = computeSynchrones(medoids, getRefSeries, nb_series_per_chunk)
	# The distance function defined in this file is computeWerDist (no final 's')
	distances = computeWerDist(synchrones)
	cluster::pam(distances, K2, diss=TRUE)$medoids
}

# Compute the synchrones curves from a clustering result: every reference
# series is assigned to its closest medoid (squared L2 distance), and each
# synchrone is the mean of the series assigned to the corresponding medoid.
#
# medoids: matrix with one medoid curve per row
# getRefSeries: function taking a vector of indices and returning the matching
#   series as a matrix (one series per row), or NULL once no series is left
# nb_series_per_chunk: number of series requested at each call to getRefSeries
#
# Returns a matrix with the same dimensions as 'medoids'. A medoid which
# attracted no series yields a row of NaN (0 / 0).
computeSynchrones = function(medoids, getRefSeries, nb_series_per_chunk)
{
	K = nrow(medoids)
	# One accumulator (sum of assigned series) and one counter per medoid
	synchrones = matrix(0, nrow=K, ncol=ncol(medoids))
	counts = rep(0,K)
	index = 1
	repeat
	{
		chunk_indices = (index-1) + seq_len(nb_series_per_chunk)
		ref_series = getRefSeries(chunk_indices)
		if (is.null(ref_series))
			break
		#get medoids indices for this chunk of series
		for (i in seq_len(nrow(ref_series)))
		{
			# Closest medoid in the sense of the (squared) L2 norm
			j = which.min( rowSums( sweep(medoids, 2, ref_series[i,], '-')^2 ) )
			synchrones[j,] = synchrones[j,] + ref_series[i,]
			counts[j] = counts[j] + 1
		}
		index = index + nb_series_per_chunk
	}
	#NOTE: odds for some clusters to be empty? (when series already come from stage 2)
	sweep(synchrones, 1, counts, '/')
}

# Compute the WER distance between the synchrones curves (in rows)
#
# Every curve is centered, transformed with Rwave::cwt, rescaled scale by scale
# and normalized by its largest modulus. For each pair of curves the smoothed
# cross term and the smoothed individual terms are combined into 'wer2', and
# the distance is sqrt(delta * nb_scales * (1 - wer2)).
#
# curves: matrix with one curve per row
#
# Returns a symmetric nrow(curves) x nrow(curves) matrix with a zero diagonal.
# Stops with an error if the Rwave package is not available.
computeWerDist = function(curves)
{
	# Only Rwave::cwt is used, so the package does not need to be attached
	if (!requireNamespace("Rwave", quietly=TRUE))
		stop("Unable to load Rwave library")
	n = nrow(curves)
	delta = ncol(curves)
	# Fewer than two curves: there is no pair to compare
	if (n <= 1)
		return (matrix(0., n, n))
	#TODO: automatic tune of all these parameters ? (for other users)
	nvoice = 4
	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
	noctave = 13
	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	s0 = 2
	w0 = 2*pi
	scaled = FALSE
	s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	totnoct = noctave + as.integer(s0log/nvoice) + 1
	# Per-scale normalization weights (identical for every curve)
	sqs = sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)

	# (normalized) observations node with CWT
	Xcwt4 = lapply(seq_len(n), function(i) {
		centered = scale(ts(curves[i,]), center=TRUE, scale=scaled)
		totts.cwt = Rwave::cwt(centered,totnoct,nvoice,w0,plot=0)
		ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
		#Normalization
		sqres = sweep(ts.cwt,MARGIN=2,sqs,'*')
		sqres / max(Mod(sqres))
	})

	fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	#TODO: 'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	# stats:: prefix so that a masking 'filter' (e.g. from dplyr) is never picked up
	smoothing = function(m) stats::filter(m, fcoefs, circular=TRUE)
	# Smoothed individual term of each curve, summed by column: depends on one
	# curve only, hence computed once instead of once per pair
	power = lapply(Xcwt4, function(X) colSums(smoothing(Mod(X * Conj(X)))))

	Xwer_dist = matrix(0., n, n)
	nb_scales = ncol(Xcwt4[[1]])
	for (i in seq_len(n-1))
	{
		for (j in (i+1):n)
		{
			#TODO: later, compute CWT here (because not enough storage space for 200k series)
			num = smoothing(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])))
			wer2 = sum(colSums(num)^2) / sum(power[[i]] * power[[j]])
			# Guard against wer2 ending marginally above 1 through rounding,
			# which would make sqrt() return NaN
			Xwer_dist[i,j] = sqrt(delta * nb_scales * max(0, 1 - wer2))
			Xwer_dist[j,i] = Xwer_dist[i,j]
		}
	}
	Xwer_dist
}
Commit	Line	Data
56857861 BA	1	# Cluster one full task (nb_curves / ntasks series); only step 1
56857861 BA	2	clusteringTask = function(indices, getCoefs, K1, nb_series_per_chunk, ncores)
5c652979	3	{
0e2dce80	4	cl = parallel::makeCluster(ncores)
7b13d0c2 BA	5	repeat
7b13d0c2 BA	6	{
e205f218	7	nb_workers = max( 1, round( length(indices) / nb_series_per_chunk ) )
48108c39	8	indices_workers = lapply(seq_len(nb_workers), function(i) {
7b13d0c2	9	upper_bound = ifelse( i<nb_workers,
e205f218 BA	10	min(nb_series_per_chunk*i,length(indices)), length(indices) )
e205f218 BA	11	indices[(nb_series_per_chunk*(i-1)+1):upper_bound]
48108c39	12	})
e205f218	13	indices = unlist( parallel::parLapply(cl, indices_workers, function(inds)
56857861 BA	14	computeClusters1(getCoefs(inds), K1)) )
56857861 BA	15	if (length(indices) == K1)
7b13d0c2 BA	16	break
7b13d0c2 BA	17	}
e205f218	18	parallel::stopCluster(cl)
56857861	19	indices #medoids
5c652979 BA	20	}
5c652979 BA	21
0e2dce80	22	# Apply the clustering algorithm (PAM) on a coeffs or distances matrix
56857861	23	computeClusters1 = function(coefs, K1)
e205f218	24	indices[ cluster::pam(coefs, K1, diss=FALSE)$id.med ]
0e2dce80	25
7b13d0c2	26	# Cluster a chunk of series inside one task (~max nb_series_per_chunk)
56857861	27	computeClusters2 = function(medoids, K2, getRefSeries, nb_series_per_chunk)
5c652979	28	{
56857861 BA	29	synchrones = computeSynchrones(medoids, getRefSeries, nb_series_per_chunk)
56857861 BA	30	cluster::pam(computeWerDists(synchrones), K2, diss=TRUE)$medoids
5c652979 BA	31	}
5c652979 BA	32
7b13d0c2	33	# Compute the synchrones curves (sum of clusters elements) from a clustering result
56857861	34	computeSynchrones = function(medoids, getRefSeries, nb_series_per_chunk)
e205f218 BA	35	{
	36	#les getSeries(indices) sont les medoides --> init vect nul pour chacun, puis incr avec les
	37	#courbes (getSeriesForSynchrones) les plus proches... --> au sens de la norme L2 ?
3eef8d3d BA	38	K = nrow(medoids)
	39	synchrones = matrix(0, nrow=K, ncol=ncol(medoids))
	40	counts = rep(0,K)
	41	index = 1
	42	repeat
	43	{
56857861 BA	44	range = (index-1) + seq_len(nb_series_per_chunk)
	45	ref_series = getRefSeries(range)
	46	if (is.null(ref_series))
3eef8d3d BA	47	break
3eef8d3d BA	48	#get medoids indices for this chunk of series
56857861 BA	49	for (i in seq_len(nrow(ref_series)))
	50	{
	51	j = which.min( rowSums( sweep(medoids, 2, series[i,], '-')^2 ) )
	52	synchrones[j,] = synchrones[j,] + series[i,]
	53	counts[j] = counts[j] + 1
	54	}
	55	index = index + nb_series_per_chunk
3eef8d3d BA	56	}
3eef8d3d BA	57	#NOTE: odds for some clusters to be empty? (when series already come from stage 2)
56857861	58	sweep(synchrones, 1, counts, '/')
e205f218	59	}
1c6f223e	60
e205f218	61	# Compute the WER distance between the synchrones curves (in rows)
7b13d0c2	62	computeWerDist = function(curves)
d03c0621	63	{
5c652979 BA	64	if (!require("Rwave", quietly=TRUE))
5c652979 BA	65	stop("Unable to load Rwave library")
7b13d0c2 BA	66	n <- nrow(curves)
7b13d0c2 BA	67	delta <- ncol(curves)
db6fc17d	68	#TODO: automatic tune of all these parameters ? (for other users)
d03c0621	69	nvoice <- 4
7b13d0c2	70	# noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
d7d55bc1 BA	71	noctave = 13
d7d55bc1 BA	72	# 4 here represent 2^5 = 32 half-hours ~ 1 day
db6fc17d BA	73	#NOTE: default scalevector == 2^(0:(noctave * nvoice) / nvoice) * s0 (?)
	74	scalevector <- 2^(4:(noctave * nvoice) / nvoice) * 2
	75	#condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
	76	s0=2
	77	w0=2*pi
	78	scaled=FALSE
	79	s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
	80	totnoct = noctave + as.integer(s0log/nvoice) + 1
	81
	82	# (normalized) observations node with CWT
	83	Xcwt4 <- lapply(seq_len(n), function(i) {
e205f218	84	ts <- scale(ts(curves[i,]), center=TRUE, scale=scaled)
db6fc17d BA	85	totts.cwt = Rwave::cwt(ts,totnoct,nvoice,w0,plot=0)
	86	ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
	87	#Normalization
	88	sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
	89	sqres <- sweep(ts.cwt,MARGIN=2,sqs,'*')
	90	sqres / max(Mod(sqres))
	91	})
3ccd1e39	92
db6fc17d BA	93	Xwer_dist <- matrix(0., n, n)
	94	fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
	95	for (i in 1:(n-1))
1c6f223e	96	{
db6fc17d	97	for (j in (i+1):n)
d03c0621	98	{
0e2dce80	99	#TODO: later, compute CWT here (because not enough storage space for 200k series)
db6fc17d BA	100	# 'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
	101	num <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	102	WX <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[i]])), fcoefs, circular=TRUE)
	103	WY <- filter(Mod(Xcwt4[[j]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
	104	wer2 <- sum(colSums(num)^2) / sum( sum(colSums(WX) * colSums(WY)) )
	105	Xwer_dist[i,j] <- sqrt(delta * ncol(Xcwt4[[1]]) * (1 - wer2))
	106	Xwer_dist[j,i] <- Xwer_dist[i,j]
d03c0621	107	}
1c6f223e	108	}
d03c0621	109	diag(Xwer_dist) <- numeric(n)
c6556868	110	Xwer_dist
1c6f223e	111	}