#Station PQV, training 2008-2013, testing 2013-2015
#side-effect: write predicted PM10 in files with meaningful names

buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0)
{
	require(mgcv)

	#get learning data
	Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE)
	Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000
	Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim"))
	Xl = na.omit(Xl)

	#get testing data
	Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE)
	Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000
	Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim"))
	#Xt = na.omit(Xt) #impossible: we need data aligned

	#prepare formula
	#formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1")
	formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1")

	#compute all weights
	n = nrow(Xl)
	weights = as.data.frame(matrix(nrow=n, ncol=0))

	#normal
	#weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"]))

	#seasons (April --> August = "low pollution")
	aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August")
	nb_aprAug = sum(aprAug_indices)
	weights_sepMar = rep(2. * n/nb_aprAug, n)
	weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug)
	weights = cbind(weights, sepMar = weights_sepMar)
	weights_aprAug = rep(8. * n/nb_aprAug, n)
	weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug)
	weights = cbind(weights, aprAug = weights_aprAug)

	#pollution
	highPolluted_indices = Xl[,"PM10_j"] > 50
	nb_highPolluted = sum(highPolluted_indices)
	midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30
	nb_midPolluted = sum(midPolluted_indices)
	weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n)
	weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted
	weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted
	weights = cbind(weights, highPollution = weights_polluted)
	weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n)
	weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted
	weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted
	weights = cbind(weights, lowPollution = weights_notPolluted)

	#week-end (saturdays and sundays are merged)
#	weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam")
#	nb_weekend = sum(weekend_indices)
#	weights_week = rep(8. * n/(n-nb_weekend), n)
#	weights_week[weekend_indices] = 2. * n/nb_weekend
#	weights = cbind(weights, week = weights_week)
#	weights_weekEnd = rep(2. * n/(n-nb_weekend), n)
#	weights_weekEnd[weekend_indices] = 8. * n/nb_weekend
#	weights = cbind(weights, weekEnd = weights_weekEnd)

	#temperature
	highTemperature_indices = Xl[,"PM10_j"] > 20
	nb_high = sum(highTemperature_indices)
	midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5
	nb_mid = sum(midTemperature_indices)
	weights_hot = rep(1. * n/(n-nb_mid-nb_high), n)
	weights_hot[midTemperature_indices] = 2. * n/nb_mid
	weights_hot[highTemperature_indices] = 7. * n/nb_high
	weights = cbind(weights, hotTemperature = weights_hot)
	weights_cold = rep(7. * n/(n-nb_mid-nb_high), n)
	weights_cold[midTemperature_indices] = 2. * n/nb_mid
	weights_cold[highTemperature_indices] = 1. * n/nb_high
	weights = cbind(weights, coldTemperature = weights_cold)

	#wind
	eastWindIndices = Xl[,"DirVent"] < 180
	westWindIndices = Xl[,"DirVent"] >= 180
	nbEast = sum(eastWindIndices)
	nbWest = sum(westWindIndices)
	weights_east = rep(8. * n/nbEast, n)
	weights_east[westWindIndices] = 2. * n/nbWest
	weights_west = rep(8. * n/nbWest, n)
	weights_west[eastWindIndices] = 2. * n/nbEast
	weights = cbind(weights, eastWind = weights_east, westWind = weights_west)

	#rain
	rainIndices = Xl[,"Pluie"] > 0
	noRainIndices = Xl[,"Pluie"] == 0
	nbRain = sum(rainIndices)
	nbNoRain = sum(noRainIndices)
	weights_rain = rep(8. * n/nbRain, n)
	weights_rain[noRainIndices] = 2. * n/nbNoRain
	weights_noRain = rep(8. * n/nbNoRain, n)
	weights_noRain[rainIndices] = 2. * n/nbRain
	weights = cbind(weights, noRain = weights_noRain, rain = weights_rain)

	#obtain predictions for each weights column
	m = ncol(weights)
	data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12]
	models = as.list(rep(NA, m))
	for (i in 1:m)
	{
		if (weightType == 0)
			modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i])))
		else if (weightType == 1)
		{
			weights_i = rep(1., n)
			weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0
			modelGAM = gam(formula, data = Xl, weights = weights_i)
		}
		else if (weightType == 2)
			modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude)
		data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt)
		models[[i]] = serialize(modelGAM, NULL)
	}
	write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE)
	return (models)
}