f35da6490c204e0a57dc0de4a0f93cf39b87bd45
[epclust.git] / data / preprocessing / serialize.c
1 #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
2 #include <inttypes.h>
3 #include <cgds/Vector.h>
4 #include <string.h>
5 #include <math.h>
6 #include <float.h>
7
// Read a (possibly negative) decimal integer from 'stream' into '*integer'.
//
// Characters preceding the first digit are skipped. The value is negated when
// the very first character read is '-'. If a field delimiter (',', '\n', '\r')
// or the end of the stream is met before any digit, the field is empty:
// '*integer' is left at 0 and the delimiter is returned, so that the digits of
// the following field are never consumed by mistake.
// Values which do not fit in 64 bits saturate at +/-INT64_MAX.
//
// Returns the last character consumed, i.e. the one following the digits (or
// the delimiter of an empty field). At end of stream this is EOF converted to
// char: callers should rely on feof() rather than on the returned value.
char readInt(FILE* stream, int64_t* integer)
{
	*integer = 0;
	// fgetc() returns an int: a plain char cannot reliably represent EOF
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);

	// skip everything up to the first digit, but never cross a field boundary
	while (curChar < '0' || curChar > '9')
	{
		if (curChar == EOF || curChar == ',' || curChar == '\n' || curChar == '\r')
			return (char)curChar;
		curChar = fgetc(stream);
	}

	// accumulate the digits (curChar holds the first one)
	do
	{
		int digit = curChar - '0';
		if (*integer > (INT64_MAX - digit) / 10)
			*integer = INT64_MAX; //saturate instead of overflowing (undefined behavior)
		else
			*integer = 10 * (*integer) + digit;
		curChar = fgetc(stream);
	}
	while (curChar >= '0' && curChar <= '9');

	(*integer) *= sign;
	return (char)curChar;
}
21
// Limits used by readReal(): digits beyond these bounds cannot change a float
enum
{
	READREAL_MAX_INT_DIGITS = 400,  //a double overflows to infinity well before
	READREAL_MAX_FRAC_DIGITS = 18,  //far more than float precision
	READREAL_MAX_EXPONENT = 308     //10^308 is still a finite double
};

// Consume a run of consecutive decimal digits from 'stream'.
// The first 'maxDigits' digits are accumulated into '*value' and counted in
// '*count'; extra digits are consumed but ignored.
// Returns the first non-digit character read (consumed), possibly EOF.
static int read_digit_run(FILE* stream, int maxDigits, double* value, int* count)
{
	int c;
	*value = 0.0;
	*count = 0;
	while ((c = fgetc(stream)) >= '0' && c <= '9')
	{
		if (*count < maxDigits)
		{
			*value = 10.0 * (*value) + (c - '0');
			(*count)++;
		}
	}
	return c;
}

// Read a real number written as [-]digits[.digits][(e|E)[+|-]digits] from
// 'stream' into '*real'.
//
// Characters preceding the number are skipped; the number is negative when the
// very first character read is '-'. If a field delimiter (',', '\n', '\r') or
// the end of the stream is met before any digit, the field is empty: '*real' is
// left untouched (so a sentinel set by the caller survives) and the delimiter
// is returned.
//
// Returns the last character consumed, i.e. the one following the number. At
// end of stream this is EOF converted to char: callers should rely on feof().
char readReal(FILE* stream, float* real)
{
	int c = fgetc(stream);
	// the sign is recorded here because it cannot be recovered from an integer
	// part equal to zero ("-0.5")
	int negative = (c == '-');

	// skip up to the number itself, never crossing a field boundary
	while (c != '.' && (c < '0' || c > '9'))
	{
		if (c == EOF || c == ',' || c == '\n' || c == '\r')
			return (char)c;
		c = fgetc(stream);
	}

	double intPart = 0.0, fracPart = 0.0;
	int intDigits = 0, fracDigits = 0;
	if (c != '.')
	{
		ungetc(c, stream);
		c = read_digit_run(stream, READREAL_MAX_INT_DIGITS, &intPart, &intDigits);
	}
	if (c == '.')
		c = read_digit_run(stream, READREAL_MAX_FRAC_DIGITS, &fracPart, &fracDigits);
	if (intDigits + fracDigits == 0)
		return (char)c; //lone '.': no number here, leave *real untouched

	int exponent = 0, negativeExponent = 0;
	if (c == 'e' || c == 'E')
	{
		c = fgetc(stream);
		if (c == '-' || c == '+')
			negativeExponent = (c == '-');
		else
			ungetc(c, stream); //no-op if c is EOF
		double expValue;
		int expDigits;
		c = read_digit_run(stream, READREAL_MAX_INT_DIGITS, &expValue, &expDigits);
		exponent = (expValue > READREAL_MAX_EXPONENT
			? READREAL_MAX_EXPONENT
			: (int)expValue);
	}

	// magnitude = intPart + fracPart / 10^fracDigits
	// (counting the digits actually read keeps leading zeros of the fraction)
	double magnitude = intPart;
	if (fracDigits > 0)
	{
		double fracScale = 1.0;
		for (int i=0; i<fracDigits; i++)
			fracScale *= 10.0;
		magnitude += fracPart / fracScale;
	}

	// apply the power of ten (expScale stays finite thanks to the clamp above)
	double expScale = 1.0;
	for (int i=0; i<exponent; i++)
		expScale *= 10.0;
	magnitude = (negativeExponent ? magnitude / expScale : magnitude * expScale);

	*real = (float)(negative ? -magnitude : magnitude);
	return (char)c;
}
47
// Parse one line of comma-separated fields, extracting an integer ID and a
// float (raw power).
//
// ifile: stream positioned at the beginning of a line
// posID, posPower: 1-based positions of the two fields of interest; a position
//   which matches no field (e.g. 0) means the corresponding output is not written
// ID: receives the content of field 'posID', narrowed from 64 to 32 bits
// rawPower: receives the content of field 'posPower'; the local value handed to
//   readReal() is preset to FLT_MAX ("NA"), which is therefore what comes out
//   if readReal() does not write its output
//
// On return all line terminators following the line have been consumed, so the
// stream is positioned on the first character of the next line (or at the end).
static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	char nextChar;
	int position = 1;
	while (1)
	{
		if (position == posID)
		{
			int64_t ID_on64bits;
			nextChar = readInt(ifile, &ID_on64bits);
			*ID = (uint32_t)ID_on64bits;
		}
		else if (position == posPower)
		{
			float power = FLT_MAX; //"NA"
			nextChar = readReal(ifile, &power);
			*rawPower = power;
		}
		else
			//uninteresting field: read its first character (the separator itself
			//if the field is empty), the rest is skipped below
			nextChar = fgetc(ifile);

		//continue until next comma (or line end, file end or read error)
		while (!feof(ifile) && !ferror(ifile)
			&& nextChar != '\n' && nextChar != '\r' && nextChar != ',')
		{
			nextChar = fgetc(ifile);
		}
		position++;

		if (feof(ifile) || ferror(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// skip all potential line feeds
			// (parentheses matter: && binds tighter than ||)
			while (!feof(ifile) && !ferror(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// give back the first character of the next line
			if (!feof(ifile) && !ferror(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
87
88 // Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header
89 void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
90 {
91 // Use the header to know positions of ID and rawPower
92 FILE* ifile = fopen(ifileName, "r");
93 uint32_t headerShift = 0;
94 char curChar;
95 Vector* header = vector_new(char);
96 do
97 {
98 curChar = fgetc(ifile);
99 headerShift++;
100 if (curChar == '\n' || curChar == '\r')
101 {
102 // Flush all potential other line feeds
103 while (curChar == '\n' || curChar == '\r')
104 curChar = fgetc(ifile);
105 ungetc(curChar, ifile);
106 break;
107 }
108 vector_push(header, curChar);
109 }
110 while (1);
111 char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char));
112 VectorIterator* it = vector_get_iterator(header);
113 int index = 0;
114 while (vectorI_has_data(it))
115 {
116 vectorI_get(it, headerString[index]);
117 vectorI_move_next(it);
118 index++;
119 }
120 vectorI_destroy(it);
121 headerString[index] = 0;
122 vector_destroy(header);
123 int position = 1, posID = 0, posPower = 0;
124 char* columnDescriptor = strtok(headerString, ",");
125 while (columnDescriptor != NULL)
126 {
127 if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id"))
128 posID = position;
129 else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE"))
130 posPower = position;
131 position++;
132 columnDescriptor = strtok(NULL, ",");
133 }
134 free(headerString);
135
136 // Estimate tsLength with a scan of the 3 first series
137 uint32_t ID=0, lastID=0, refTsLength=0;
138 float rawPower = 0.;
139 scan_line(ifile, posID, &ID, posPower, &rawPower);
140 //'sl' = sample lengths (short because a lot of comparisons then)
141 uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t));
142 for (int i=0; i<3; i++)
143 {
144 lastID = ID;
145 while (ID == lastID)
146 {
147 sl[i]++;
148 scan_line(ifile, posID, &ID, posPower, &rawPower);
149 }
150 }
151 if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2])
152 refTsLength = sl[2];
153 else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0])
154 refTsLength = sl[0];
155 else
156 refTsLength = sl[1];
157 free(sl);
158 //go back at the beginning of the first series (ready to read '\n'...)
159 fseek(ifile, headerShift-1, SEEK_SET);
160
161 // output file to write time-series sequentially, CSV format.
162 FILE* ofile = fopen(ofileName, "w");
163
164 // process one client (ID in first column) at a time
165 uint64_t processedLines = 0; //execution trace
166 uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
167 uint32_t mismatchLengthCount=0;
168 float tsBuffer[refTsLength];
169 lastID = 0;
170 while (!feof(ifile))
171 {
172 // next element to read always start with a digit
173 do
174 curChar = fgetc(ifile);
175 while (!feof(ifile) && (curChar < '0' || curChar > '9'));
176 if (feof(ifile))
177 break;
178 ungetc(curChar, ifile);
179
180 // read line
181 scan_line(ifile, posID, &ID, posPower, &rawPower);
182 if (ID != lastID)
183 {
184 //just starting a new time-series: must process the last one (if there is a last one !)
185 if (lastID > 0)
186 {
187 if (tsLength == refTsLength)
188 {
189 for (int i=0; i<tsLength; i++)
190 {
191 char* format = i<tsLength-1 ? "%g," : "%g";
192 fprintf(ofile, format, tsBuffer[i]);
193 }
194 fprintf(ofile, "\n");
195 if (nbItems > 0 && ++seriesCount >= nbItems)
196 break;
197 }
198 //if something wrong happened, skip series
199 else
200 {
201 skippedSeriesCount++;
202 if (tsLength != refTsLength)
203 mismatchLengthCount++;
204 }
205 }
206
207 // reinitialize flags
208 tsLength = 0;
209 lastID = ID;
210 }
211
212 //We cannot write more than refTsLength values
213 if (tsLength < refTsLength)
214 tsBuffer[tsLength++] = rawPower;
215
216 if ((++processedLines) % 1000000 == 0)
217 fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
218 }
219
220 if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems))
221 {
222 // flush last time-series if all conditions are met
223 for (int i=0; i<tsLength; i++)
224 {
225 char* format = i<tsLength-1 ? "%g," : "%g";
226 fprintf(ofile, format, tsBuffer[i]);
227 }
228 fprintf(ofile, "\n");
229 seriesCount++;
230 }
231 else if (nbItems <= 0 || seriesCount < nbItems)
232 {
233 if (tsLength != refTsLength)
234 mismatchLengthCount++;
235 }
236
237 // finally print some statistics
238 if (seriesCount < nbItems)
239 fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount);
240 fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount);
241
242 fclose(ifile);
243 fclose(ofile);
244 }