Initial commit
[aggexp.git] / scripts / buildexp / build_GAM.R
1 #Station PQV, training 2008-2013, testing 2013-2015
2 #side-effect: write predicted PM10 in files with meaningful names
3
4 buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0)
5 {
6 require(mgcv)
7
8 #get learning data
9 Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE)
10 Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000
11 Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim"))
12 Xl = na.omit(Xl)
13
14 #get testing data
15 Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE)
16 Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000
17 Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim"))
18 #Xt = na.omit(Xt) #impossible: we need data aligned
19
20 #prepare formula
21 #formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1")
22 formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1")
23
24 #compute all weights
25 n = nrow(Xl)
26 weights = as.data.frame(matrix(nrow=n, ncol=0))
27
28 #normal
29 #weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"]))
30
31 #seasons (April --> August = "low pollution")
32 aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August")
33 nb_aprAug = sum(aprAug_indices)
34 weights_sepMar = rep(2. * n/nb_aprAug, n)
35 weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug)
36 weights = cbind(weights, sepMar = weights_sepMar)
37 weights_aprAug = rep(8. * n/nb_aprAug, n)
38 weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug)
39 weights = cbind(weights, aprAug = weights_aprAug)
40
41 #pollution
42 highPolluted_indices = Xl[,"PM10_j"] > 50
43 nb_highPolluted = sum(highPolluted_indices)
44 midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30
45 nb_midPolluted = sum(midPolluted_indices)
46 weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n)
47 weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted
48 weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted
49 weights = cbind(weights, highPollution = weights_polluted)
50 weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n)
51 weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted
52 weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted
53 weights = cbind(weights, lowPollution = weights_notPolluted)
54
55 #week-end (saturdays and sundays are merged)
56 # weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam")
57 # nb_weekend = sum(weekend_indices)
58 # weights_week = rep(8. * n/(n-nb_weekend), n)
59 # weights_week[weekend_indices] = 2. * n/nb_weekend
60 # weights = cbind(weights, week = weights_week)
61 # weights_weekEnd = rep(2. * n/(n-nb_weekend), n)
62 # weights_weekEnd[weekend_indices] = 8. * n/nb_weekend
63 # weights = cbind(weights, weekEnd = weights_weekEnd)
64
65 #temperature
66 highTemperature_indices = Xl[,"PM10_j"] > 20
67 nb_high = sum(highTemperature_indices)
68 midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5
69 nb_mid = sum(midTemperature_indices)
70 weights_hot = rep(1. * n/(n-nb_mid-nb_high), n)
71 weights_hot[midTemperature_indices] = 2. * n/nb_mid
72 weights_hot[highTemperature_indices] = 7. * n/nb_high
73 weights = cbind(weights, hotTemperature = weights_hot)
74 weights_cold = rep(7. * n/(n-nb_mid-nb_high), n)
75 weights_cold[midTemperature_indices] = 2. * n/nb_mid
76 weights_cold[highTemperature_indices] = 1. * n/nb_high
77 weights = cbind(weights, coldTemperature = weights_cold)
78
79 #wind
80 eastWindIndices = Xl[,"DirVent"] < 180
81 westWindIndices = Xl[,"DirVent"] >= 180
82 nbEast = sum(eastWindIndices)
83 nbWest = sum(westWindIndices)
84 weights_east = rep(8. * n/nbEast, n)
85 weights_east[westWindIndices] = 2. * n/nbWest
86 weights_west = rep(8. * n/nbWest, n)
87 weights_west[eastWindIndices] = 2. * n/nbEast
88 weights = cbind(weights, eastWind = weights_east, westWind = weights_west)
89
90 #rain
91 rainIndices = Xl[,"Pluie"] > 0
92 noRainIndices = Xl[,"Pluie"] == 0
93 nbRain = sum(rainIndices)
94 nbNoRain = sum(noRainIndices)
95 weights_rain = rep(8. * n/nbRain, n)
96 weights_rain[noRainIndices] = 2. * n/nbNoRain
97 weights_noRain = rep(8. * n/nbNoRain, n)
98 weights_noRain[rainIndices] = 2. * n/nbRain
99 weights = cbind(weights, noRain = weights_noRain, rain = weights_rain)
100
101 #obtain predictions for each weights column
102 m = ncol(weights)
103 data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12]
104 models = as.list(rep(NA, m))
105 for (i in 1:m)
106 {
107 if (weightType == 0)
108 modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i])))
109 else if (weightType == 1)
110 {
111 weights_i = rep(1., n)
112 weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0
113 modelGAM = gam(formula, data = Xl, weights = weights_i)
114 }
115 else if (weightType == 2)
116 modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude)
117 data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt)
118 models[[i]] = serialize(modelGAM, NULL)
119 }
120 write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE)
121 return (models)
122 }