| 1 | #Station PQV, training 2008-2013, testing 2013-2015 |
| 2 | #side-effect: write predicted PM10 in files with meaningful names |
| 3 | |
| 4 | buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0) |
| 5 | { |
| 6 | require(mgcv) |
| 7 | |
| 8 | #get learning data |
| 9 | Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE) |
| 10 | Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000 |
| 11 | Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim")) |
| 12 | Xl = na.omit(Xl) |
| 13 | |
| 14 | #get testing data |
| 15 | Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE) |
| 16 | Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000 |
| 17 | Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim")) |
| 18 | #Xt = na.omit(Xt) #impossible: we need data aligned |
| 19 | |
| 20 | #prepare formula |
| 21 | #formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1") |
| 22 | formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1") |
| 23 | |
| 24 | #compute all weights |
| 25 | n = nrow(Xl) |
| 26 | weights = as.data.frame(matrix(nrow=n, ncol=0)) |
| 27 | |
| 28 | #normal |
| 29 | #weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"])) |
| 30 | |
| 31 | #seasons (April --> August = "low pollution") |
| 32 | aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August") |
| 33 | nb_aprAug = sum(aprAug_indices) |
| 34 | weights_sepMar = rep(2. * n/nb_aprAug, n) |
| 35 | weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug) |
| 36 | weights = cbind(weights, sepMar = weights_sepMar) |
| 37 | weights_aprAug = rep(8. * n/nb_aprAug, n) |
| 38 | weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug) |
| 39 | weights = cbind(weights, aprAug = weights_aprAug) |
| 40 | |
| 41 | #pollution |
| 42 | highPolluted_indices = Xl[,"PM10_j"] > 50 |
| 43 | nb_highPolluted = sum(highPolluted_indices) |
| 44 | midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30 |
| 45 | nb_midPolluted = sum(midPolluted_indices) |
| 46 | weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n) |
| 47 | weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted |
| 48 | weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted |
| 49 | weights = cbind(weights, highPollution = weights_polluted) |
| 50 | weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n) |
| 51 | weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted |
| 52 | weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted |
| 53 | weights = cbind(weights, lowPollution = weights_notPolluted) |
| 54 | |
| 55 | #week-end (saturdays and sundays are merged) |
| 56 | # weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam") |
| 57 | # nb_weekend = sum(weekend_indices) |
| 58 | # weights_week = rep(8. * n/(n-nb_weekend), n) |
| 59 | # weights_week[weekend_indices] = 2. * n/nb_weekend |
| 60 | # weights = cbind(weights, week = weights_week) |
| 61 | # weights_weekEnd = rep(2. * n/(n-nb_weekend), n) |
| 62 | # weights_weekEnd[weekend_indices] = 8. * n/nb_weekend |
| 63 | # weights = cbind(weights, weekEnd = weights_weekEnd) |
| 64 | |
| 65 | #temperature |
| 66 | highTemperature_indices = Xl[,"PM10_j"] > 20 |
| 67 | nb_high = sum(highTemperature_indices) |
| 68 | midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5 |
| 69 | nb_mid = sum(midTemperature_indices) |
| 70 | weights_hot = rep(1. * n/(n-nb_mid-nb_high), n) |
| 71 | weights_hot[midTemperature_indices] = 2. * n/nb_mid |
| 72 | weights_hot[highTemperature_indices] = 7. * n/nb_high |
| 73 | weights = cbind(weights, hotTemperature = weights_hot) |
| 74 | weights_cold = rep(7. * n/(n-nb_mid-nb_high), n) |
| 75 | weights_cold[midTemperature_indices] = 2. * n/nb_mid |
| 76 | weights_cold[highTemperature_indices] = 1. * n/nb_high |
| 77 | weights = cbind(weights, coldTemperature = weights_cold) |
| 78 | |
| 79 | #wind |
| 80 | eastWindIndices = Xl[,"DirVent"] < 180 |
| 81 | westWindIndices = Xl[,"DirVent"] >= 180 |
| 82 | nbEast = sum(eastWindIndices) |
| 83 | nbWest = sum(westWindIndices) |
| 84 | weights_east = rep(8. * n/nbEast, n) |
| 85 | weights_east[westWindIndices] = 2. * n/nbWest |
| 86 | weights_west = rep(8. * n/nbWest, n) |
| 87 | weights_west[eastWindIndices] = 2. * n/nbEast |
| 88 | weights = cbind(weights, eastWind = weights_east, westWind = weights_west) |
| 89 | |
| 90 | #rain |
| 91 | rainIndices = Xl[,"Pluie"] > 0 |
| 92 | noRainIndices = Xl[,"Pluie"] == 0 |
| 93 | nbRain = sum(rainIndices) |
| 94 | nbNoRain = sum(noRainIndices) |
| 95 | weights_rain = rep(8. * n/nbRain, n) |
| 96 | weights_rain[noRainIndices] = 2. * n/nbNoRain |
| 97 | weights_noRain = rep(8. * n/nbNoRain, n) |
| 98 | weights_noRain[rainIndices] = 2. * n/nbRain |
| 99 | weights = cbind(weights, noRain = weights_noRain, rain = weights_rain) |
| 100 | |
| 101 | #obtain predictions for each weights column |
| 102 | m = ncol(weights) |
| 103 | data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12] |
| 104 | models = as.list(rep(NA, m)) |
| 105 | for (i in 1:m) |
| 106 | { |
| 107 | if (weightType == 0) |
| 108 | modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i]))) |
| 109 | else if (weightType == 1) |
| 110 | { |
| 111 | weights_i = rep(1., n) |
| 112 | weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0 |
| 113 | modelGAM = gam(formula, data = Xl, weights = weights_i) |
| 114 | } |
| 115 | else if (weightType == 2) |
| 116 | modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude) |
| 117 | data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt) |
| 118 | models[[i]] = serialize(modelGAM, NULL) |
| 119 | } |
| 120 | write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE) |
| 121 | return (models) |
| 122 | } |