[aggexp.git] / scripts / buildexp / build_GAM.R

#Station PQV, training 2008-2013, testing 2013-2015
#side-effect: write predicted PM10 in files with meaningful names

buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0)
{
	require(mgcv)

	#get learning data
	Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE)
	Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000
	Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim"))
	Xl = na.omit(Xl)

	#get testing data
	Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE)
	Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000
	Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim"))
	#Xt = na.omit(Xt) #impossible: we need data aligned

	#prepare formula
	#formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1")
	formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1")

	#compute all weights
	n = nrow(Xl)
	weights = as.data.frame(matrix(nrow=n, ncol=0))

	#normal
	#weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"]))

	#seasons (April --> August = "low pollution")
	aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August")
	nb_aprAug = sum(aprAug_indices)
	weights_sepMar = rep(2. * n/nb_aprAug, n)
	weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug)
	weights = cbind(weights, sepMar = weights_sepMar)
	weights_aprAug = rep(8. * n/nb_aprAug, n)
	weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug)
	weights = cbind(weights, aprAug = weights_aprAug)

	#pollution
	highPolluted_indices = Xl[,"PM10_j"] > 50
	nb_highPolluted = sum(highPolluted_indices)
	midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30
	nb_midPolluted = sum(midPolluted_indices)
	weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n)
	weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted
	weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted
	weights = cbind(weights, highPollution = weights_polluted)
	weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n)
	weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted
	weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted
	weights = cbind(weights, lowPollution = weights_notPolluted)

	#week-end (saturdays and sundays are merged)
#	weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam")
#	nb_weekend = sum(weekend_indices)
#	weights_week = rep(8. * n/(n-nb_weekend), n)
#	weights_week[weekend_indices] = 2. * n/nb_weekend
#	weights = cbind(weights, week = weights_week)
#	weights_weekEnd = rep(2. * n/(n-nb_weekend), n)
#	weights_weekEnd[weekend_indices] = 8. * n/nb_weekend
#	weights = cbind(weights, weekEnd = weights_weekEnd)

	#temperature
	highTemperature_indices = Xl[,"PM10_j"] > 20
	nb_high = sum(highTemperature_indices)
	midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5
	nb_mid = sum(midTemperature_indices)
	weights_hot = rep(1. * n/(n-nb_mid-nb_high), n)
	weights_hot[midTemperature_indices] = 2. * n/nb_mid
	weights_hot[highTemperature_indices] = 7. * n/nb_high
	weights = cbind(weights, hotTemperature = weights_hot)
	weights_cold = rep(7. * n/(n-nb_mid-nb_high), n)
	weights_cold[midTemperature_indices] = 2. * n/nb_mid
	weights_cold[highTemperature_indices] = 1. * n/nb_high
	weights = cbind(weights, coldTemperature = weights_cold)

	#wind
	eastWindIndices = Xl[,"DirVent"] < 180
	westWindIndices = Xl[,"DirVent"] >= 180
	nbEast = sum(eastWindIndices)
	nbWest = sum(westWindIndices)
	weights_east = rep(8. * n/nbEast, n)
	weights_east[westWindIndices] = 2. * n/nbWest
	weights_west = rep(8. * n/nbWest, n)
	weights_west[eastWindIndices] = 2. * n/nbEast
	weights = cbind(weights, eastWind = weights_east, westWind = weights_west)

	#rain
	rainIndices = Xl[,"Pluie"] > 0
	noRainIndices = Xl[,"Pluie"] == 0
	nbRain = sum(rainIndices)
	nbNoRain = sum(noRainIndices)
	weights_rain = rep(8. * n/nbRain, n)
	weights_rain[noRainIndices] = 2. * n/nbNoRain
	weights_noRain = rep(8. * n/nbNoRain, n)
	weights_noRain[rainIndices] = 2. * n/nbRain
	weights = cbind(weights, noRain = weights_noRain, rain = weights_rain)

	#obtain predictions for each weights column
	m = ncol(weights)
	data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12]
	models = as.list(rep(NA, m))
	for (i in 1:m)
	{
		if (weightType == 0)
			modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i])))
		else if (weightType == 1)
		{
			weights_i = rep(1., n)
			weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0
			modelGAM = gam(formula, data = Xl, weights = weights_i)
		}
		else if (weightType == 2)
			modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude)
		data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt)
		models[[i]] = serialize(modelGAM, NULL)
	}
	write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE)
	return (models)
}
Commit	Line	Data
	1	#Station PQV, training 2008-2013, testing 2013-2015
	2	#side-effect: write predicted PM10 in files with meaningful names
	3
	4	buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0)
	5	{
	6	require(mgcv)
	7
	8	#get learning data
	9	Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE)
	10	Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000
	11	Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim"))
	12	Xl = na.omit(Xl)
	13
	14	#get testing data
	15	Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE)
	16	Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000
	17	Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim"))
	18	#Xt = na.omit(Xt) #impossible: we need data aligned
	19
	20	#prepare formula
	21	#formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1")
	22	formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1")
	23
	24	#compute all weights
	25	n = nrow(Xl)
	26	weights = as.data.frame(matrix(nrow=n, ncol=0))
	27
	28	#normal
	29	#weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"]))
	30
	31	#seasons (April --> August = "low pollution")
	32	aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August")
	33	nb_aprAug = sum(aprAug_indices)
	34	weights_sepMar = rep(2. * n/nb_aprAug, n)
	35	weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug)
	36	weights = cbind(weights, sepMar = weights_sepMar)
	37	weights_aprAug = rep(8. * n/nb_aprAug, n)
	38	weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug)
	39	weights = cbind(weights, aprAug = weights_aprAug)
	40
	41	#pollution
	42	highPolluted_indices = Xl[,"PM10_j"] > 50
	43	nb_highPolluted = sum(highPolluted_indices)
	44	midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30
	45	nb_midPolluted = sum(midPolluted_indices)
	46	weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n)
	47	weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted
	48	weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted
	49	weights = cbind(weights, highPollution = weights_polluted)
	50	weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n)
	51	weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted
	52	weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted
	53	weights = cbind(weights, lowPollution = weights_notPolluted)
	54
	55	#week-end (saturdays and sundays are merged)
	56	# weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam")
	57	# nb_weekend = sum(weekend_indices)
	58	# weights_week = rep(8. * n/(n-nb_weekend), n)
	59	# weights_week[weekend_indices] = 2. * n/nb_weekend
	60	# weights = cbind(weights, week = weights_week)
	61	# weights_weekEnd = rep(2. * n/(n-nb_weekend), n)
	62	# weights_weekEnd[weekend_indices] = 8. * n/nb_weekend
	63	# weights = cbind(weights, weekEnd = weights_weekEnd)
	64
	65	#temperature
	66	highTemperature_indices = Xl[,"PM10_j"] > 20
	67	nb_high = sum(highTemperature_indices)
	68	midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5
	69	nb_mid = sum(midTemperature_indices)
	70	weights_hot = rep(1. * n/(n-nb_mid-nb_high), n)
	71	weights_hot[midTemperature_indices] = 2. * n/nb_mid
	72	weights_hot[highTemperature_indices] = 7. * n/nb_high
	73	weights = cbind(weights, hotTemperature = weights_hot)
	74	weights_cold = rep(7. * n/(n-nb_mid-nb_high), n)
	75	weights_cold[midTemperature_indices] = 2. * n/nb_mid
	76	weights_cold[highTemperature_indices] = 1. * n/nb_high
	77	weights = cbind(weights, coldTemperature = weights_cold)
	78
	79	#wind
	80	eastWindIndices = Xl[,"DirVent"] < 180
	81	westWindIndices = Xl[,"DirVent"] >= 180
	82	nbEast = sum(eastWindIndices)
	83	nbWest = sum(westWindIndices)
	84	weights_east = rep(8. * n/nbEast, n)
	85	weights_east[westWindIndices] = 2. * n/nbWest
	86	weights_west = rep(8. * n/nbWest, n)
	87	weights_west[eastWindIndices] = 2. * n/nbEast
	88	weights = cbind(weights, eastWind = weights_east, westWind = weights_west)
	89
	90	#rain
	91	rainIndices = Xl[,"Pluie"] > 0
	92	noRainIndices = Xl[,"Pluie"] == 0
	93	nbRain = sum(rainIndices)
	94	nbNoRain = sum(noRainIndices)
	95	weights_rain = rep(8. * n/nbRain, n)
	96	weights_rain[noRainIndices] = 2. * n/nbNoRain
	97	weights_noRain = rep(8. * n/nbNoRain, n)
	98	weights_noRain[rainIndices] = 2. * n/nbRain
	99	weights = cbind(weights, noRain = weights_noRain, rain = weights_rain)
	100
	101	#obtain predictions for each weights column
	102	m = ncol(weights)
	103	data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12]
	104	models = as.list(rep(NA, m))
	105	for (i in 1:m)
	106	{
	107	if (weightType == 0)
	108	modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i])))
	109	else if (weightType == 1)
	110	{
	111	weights_i = rep(1., n)
	112	weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0
	113	modelGAM = gam(formula, data = Xl, weights = weights_i)
	114	}
	115	else if (weightType == 2)
	116	modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude)
	117	data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt)
	118	models[[i]] = serialize(modelGAM, NULL)
	119	}
	120	write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE)
	121	return (models)
	122	}