#Station PQV, training 2008-2013, testing 2013-2015 #side-effect: write predicted PM10 in files with meaningful names buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0) { require(mgcv) #get learning data Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE) Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000 Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim")) Xl = na.omit(Xl) #get testing data Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE) Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000 Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim")) #Xt = na.omit(Xt) #impossible: we need data aligned #prepare formula #formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1") formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1") #compute all weights n = nrow(Xl) weights = as.data.frame(matrix(nrow=n, ncol=0)) #normal #weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"])) #seasons (April --> August = "low pollution") aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August") nb_aprAug = sum(aprAug_indices) weights_sepMar = rep(2. * n/nb_aprAug, n) weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug) weights = cbind(weights, sepMar = weights_sepMar) weights_aprAug = rep(8. * n/nb_aprAug, n) weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug) weights = cbind(weights, aprAug = weights_aprAug) #pollution highPolluted_indices = Xl[,"PM10_j"] > 50 nb_highPolluted = sum(highPolluted_indices) midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30 nb_midPolluted = sum(midPolluted_indices) weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n) weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted weights = cbind(weights, highPollution = weights_polluted) weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n) weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted weights = cbind(weights, lowPollution = weights_notPolluted) #week-end (saturdays and sundays are merged) # weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam") # nb_weekend = sum(weekend_indices) # weights_week = rep(8. * n/(n-nb_weekend), n) # weights_week[weekend_indices] = 2. * n/nb_weekend # weights = cbind(weights, week = weights_week) # weights_weekEnd = rep(2. * n/(n-nb_weekend), n) # weights_weekEnd[weekend_indices] = 8. * n/nb_weekend # weights = cbind(weights, weekEnd = weights_weekEnd) #temperature highTemperature_indices = Xl[,"PM10_j"] > 20 nb_high = sum(highTemperature_indices) midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5 nb_mid = sum(midTemperature_indices) weights_hot = rep(1. * n/(n-nb_mid-nb_high), n) weights_hot[midTemperature_indices] = 2. * n/nb_mid weights_hot[highTemperature_indices] = 7. * n/nb_high weights = cbind(weights, hotTemperature = weights_hot) weights_cold = rep(7. * n/(n-nb_mid-nb_high), n) weights_cold[midTemperature_indices] = 2. * n/nb_mid weights_cold[highTemperature_indices] = 1. * n/nb_high weights = cbind(weights, coldTemperature = weights_cold) #wind eastWindIndices = Xl[,"DirVent"] < 180 westWindIndices = Xl[,"DirVent"] >= 180 nbEast = sum(eastWindIndices) nbWest = sum(westWindIndices) weights_east = rep(8. * n/nbEast, n) weights_east[westWindIndices] = 2. * n/nbWest weights_west = rep(8. * n/nbWest, n) weights_west[eastWindIndices] = 2. * n/nbEast weights = cbind(weights, eastWind = weights_east, westWind = weights_west) #rain rainIndices = Xl[,"Pluie"] > 0 noRainIndices = Xl[,"Pluie"] == 0 nbRain = sum(rainIndices) nbNoRain = sum(noRainIndices) weights_rain = rep(8. * n/nbRain, n) weights_rain[noRainIndices] = 2. * n/nbNoRain weights_noRain = rep(8. * n/nbNoRain, n) weights_noRain[rainIndices] = 2. * n/nbRain weights = cbind(weights, noRain = weights_noRain, rain = weights_rain) #obtain predictions for each weights column m = ncol(weights) data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12] models = as.list(rep(NA, m)) for (i in 1:m) { if (weightType == 0) modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i]))) else if (weightType == 1) { weights_i = rep(1., n) weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0 modelGAM = gam(formula, data = Xl, weights = weights_i) } else if (weightType == 2) modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude) data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt) models[[i]] = serialize(modelGAM, NULL) } write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE) return (models) }