Commit | Line | Data |
---|---|---|
a961f8a1 BA |
1 | #Station PQV, training 2008-2013, testing 2013-2015 |
2 | #side-effect: write predicted PM10 in files with meaningful names | |
3 | ||
4 | buildGAM = function(ifile = "../aggexp/data/PQV2.csv", weightType = 0) | |
5 | { | |
6 | require(mgcv) | |
7 | ||
8 | #get learning data | |
9 | Xl = read.table("data_PQV/train.csv", header=TRUE, sep=";", as.is=TRUE) | |
10 | Xl[,"PAmoy"] = Xl[,"PAmoy"] - 1000 | |
11 | Xl[,"JourJM1"] = factor(Xl[,"JourJM1"], levels=c("sem","ven","sam","dim")) | |
12 | Xl = na.omit(Xl) | |
13 | ||
14 | #get testing data | |
15 | Xt = read.table("data_PQV/test.csv", header=TRUE, sep=";", as.is=TRUE) | |
16 | Xt[,"PAmoy"] = Xt[,"PAmoy"] - 1000 | |
17 | Xt[,"JourJM1"] = factor(Xt[,"JourJM1"], levels = c("sem","ven","sam","dim")) | |
18 | #Xt = na.omit(Xt) #impossible: we need data aligned | |
19 | ||
20 | #prepare formula | |
21 | #formula = as.formula("PM10_j ~ s(PM10_jm1,bs='cr',k=20) + s(Tmoy,bs='cr',k=20) + s(VVmoy,bs='cr',k=20) + s(PAmoy,bs='cr',k=20) + s(GTmax,bs='cr',k=20) + JourJM1") | |
22 | formula = as.formula("PM10_j ~ s(PM10_jm1) + s(Tmoy) + s(VVmoy) + s(PAmoy) + s(GTmax) + JourJM1") | |
23 | ||
24 | #compute all weights | |
25 | n = nrow(Xl) | |
26 | weights = as.data.frame(matrix(nrow=n, ncol=0)) | |
27 | ||
28 | #normal | |
29 | #weights = cbind(weights, normal = sqrt(Xl[, "PM10_j"])) | |
30 | ||
31 | #seasons (April --> August = "low pollution") | |
32 | aprAug_indices = months(as.Date(Xl[,"Date"])) %in% c("April","May","June","July","August") | |
33 | nb_aprAug = sum(aprAug_indices) | |
34 | weights_sepMar = rep(2. * n/nb_aprAug, n) | |
35 | weights_sepMar[!aprAug_indices] = 8. * n/(n-nb_aprAug) | |
36 | weights = cbind(weights, sepMar = weights_sepMar) | |
37 | weights_aprAug = rep(8. * n/nb_aprAug, n) | |
38 | weights_aprAug[!aprAug_indices] = 2. * n/(n-nb_aprAug) | |
39 | weights = cbind(weights, aprAug = weights_aprAug) | |
40 | ||
41 | #pollution | |
42 | highPolluted_indices = Xl[,"PM10_j"] > 50 | |
43 | nb_highPolluted = sum(highPolluted_indices) | |
44 | midPolluted_indices = !highPolluted_indices & Xl[,"PM10_j"] > 30 | |
45 | nb_midPolluted = sum(midPolluted_indices) | |
46 | weights_polluted = rep(1. * n/(n-nb_highPolluted-nb_midPolluted), n) | |
47 | weights_polluted[midPolluted_indices] = 3. * n/nb_midPolluted | |
48 | weights_polluted[highPolluted_indices] = 6. * n/nb_highPolluted | |
49 | weights = cbind(weights, highPollution = weights_polluted) | |
50 | weights_notPolluted = rep(8. * n/(n-nb_highPolluted-nb_midPolluted), n) | |
51 | weights_notPolluted[midPolluted_indices] = 1. * n/nb_midPolluted | |
52 | weights_notPolluted[highPolluted_indices] = 1. * n/nb_highPolluted | |
53 | weights = cbind(weights, lowPollution = weights_notPolluted) | |
54 | ||
55 | #week-end (saturdays and sundays are merged) | |
56 | # weekend_indices = Xl[,"JourJM1"] %in% c("ven", "sam") | |
57 | # nb_weekend = sum(weekend_indices) | |
58 | # weights_week = rep(8. * n/(n-nb_weekend), n) | |
59 | # weights_week[weekend_indices] = 2. * n/nb_weekend | |
60 | # weights = cbind(weights, week = weights_week) | |
61 | # weights_weekEnd = rep(2. * n/(n-nb_weekend), n) | |
62 | # weights_weekEnd[weekend_indices] = 8. * n/nb_weekend | |
63 | # weights = cbind(weights, weekEnd = weights_weekEnd) | |
64 | ||
65 | #temperature | |
66 | highTemperature_indices = Xl[,"PM10_j"] > 20 | |
67 | nb_high = sum(highTemperature_indices) | |
68 | midTemperature_indices = !highTemperature_indices & Xl[,"Tmoy"] > 5 | |
69 | nb_mid = sum(midTemperature_indices) | |
70 | weights_hot = rep(1. * n/(n-nb_mid-nb_high), n) | |
71 | weights_hot[midTemperature_indices] = 2. * n/nb_mid | |
72 | weights_hot[highTemperature_indices] = 7. * n/nb_high | |
73 | weights = cbind(weights, hotTemperature = weights_hot) | |
74 | weights_cold = rep(7. * n/(n-nb_mid-nb_high), n) | |
75 | weights_cold[midTemperature_indices] = 2. * n/nb_mid | |
76 | weights_cold[highTemperature_indices] = 1. * n/nb_high | |
77 | weights = cbind(weights, coldTemperature = weights_cold) | |
78 | ||
79 | #wind | |
80 | eastWindIndices = Xl[,"DirVent"] < 180 | |
81 | westWindIndices = Xl[,"DirVent"] >= 180 | |
82 | nbEast = sum(eastWindIndices) | |
83 | nbWest = sum(westWindIndices) | |
84 | weights_east = rep(8. * n/nbEast, n) | |
85 | weights_east[westWindIndices] = 2. * n/nbWest | |
86 | weights_west = rep(8. * n/nbWest, n) | |
87 | weights_west[eastWindIndices] = 2. * n/nbEast | |
88 | weights = cbind(weights, eastWind = weights_east, westWind = weights_west) | |
89 | ||
90 | #rain | |
91 | rainIndices = Xl[,"Pluie"] > 0 | |
92 | noRainIndices = Xl[,"Pluie"] == 0 | |
93 | nbRain = sum(rainIndices) | |
94 | nbNoRain = sum(noRainIndices) | |
95 | weights_rain = rep(8. * n/nbRain, n) | |
96 | weights_rain[noRainIndices] = 2. * n/nbNoRain | |
97 | weights_noRain = rep(8. * n/nbNoRain, n) | |
98 | weights_noRain[rainIndices] = 2. * n/nbRain | |
99 | weights = cbind(weights, noRain = weights_noRain, rain = weights_rain) | |
100 | ||
101 | #obtain predictions for each weights column | |
102 | m = ncol(weights) | |
103 | data_PQV = read.table(ifile, header=TRUE, sep=";", as.is=TRUE)[,1:12] | |
104 | models = as.list(rep(NA, m)) | |
105 | for (i in 1:m) | |
106 | { | |
107 | if (weightType == 0) | |
108 | modelGAM = gam(formula, data = subset(Xl, subset = weights[,i] == max(weights[,i]))) | |
109 | else if (weightType == 1) | |
110 | { | |
111 | weights_i = rep(1., n) | |
112 | weights_i[weights[,i] != max(weights[,i])] = .Machine$double.eps #almost 0 | |
113 | modelGAM = gam(formula, data = Xl, weights = weights_i) | |
114 | } | |
115 | else if (weightType == 2) | |
116 | modelGAM = gam(formula, data = Xl, weights = weights[,i]) #, na.action=na.exclude) | |
117 | data_PQV[,paste("GAM_",names(weights)[i],sep="")] = predict(modelGAM, newdata = Xt) | |
118 | models[[i]] = serialize(modelGAM, NULL) | |
119 | } | |
120 | write.table(data_PQV, ifile, quote=FALSE, sep=";", row.names=FALSE) | |
121 | return (models) | |
122 | } |