Commit | Line | Data |
---|---|---|
81923e5c BA |
1 | #include "TimeSeries/serialize.h" |
2 | #define __STDC_FORMAT_MACROS | |
3 | #include <inttypes.h> | |
4 | #include <math.h> | |
5 | #include "Util/types.h" | |
6 | #include "Util/utils.h" | |
7 | #include <cds/Vector.h> | |
8 | #include <string.h> | |
9 | ||
10 | // parse a line into two integers (ID, raw power) | |
11 | static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, uint32_t* rawPower) | |
12 | { | |
13 | char nextChar; | |
14 | int position = 1; | |
15 | while (1) | |
16 | { | |
17 | if (position == posID) | |
18 | { | |
19 | int64_t ID_on64bits; | |
20 | nextChar = readInt(ifile, &ID_on64bits); | |
21 | *ID = (uint32_t)ID_on64bits; | |
22 | } | |
23 | else if (position == posPower) | |
24 | { | |
25 | Real untruncatedPower; | |
26 | nextChar = readReal(ifile, &untruncatedPower); | |
27 | if (untruncatedPower < 0.0) | |
28 | untruncatedPower = 0.0; | |
29 | *rawPower = (uint32_t) floor(untruncatedPower*10.0); | |
30 | } | |
31 | else | |
32 | //erase the comma (and skip field then) | |
33 | nextChar = fgetc(ifile); | |
34 | ||
35 | //continue until next comma (or line end or file end) | |
36 | while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',') | |
37 | nextChar = fgetc(ifile); | |
38 | position++; | |
39 | ||
40 | if (feof(ifile) || nextChar == '\n' || nextChar == '\r') | |
41 | { | |
42 | // skip all potential line feeds | |
43 | while (!feof(ifile) && nextChar == '\n' || nextChar == '\r') | |
44 | nextChar = fgetc(ifile); | |
45 | if (!feof(ifile)) | |
46 | ungetc(nextChar, ifile); | |
47 | break; | |
48 | } | |
49 | } | |
50 | } | |
51 | ||
52 | //main job: parse a text file into a binary compressed version | |
53 | //TODO [long term]: adapt to distributed files/DB, maybe to distributed binary chunks | |
54 | void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbItems) | |
55 | { | |
56 | // use the header to know positions of ID and rawPower | |
57 | FILE* ifile = fopen(ifileName, "r"); | |
58 | uint32_t headerShift = 0; | |
59 | char curChar; | |
60 | Vector* header = vector_new(char); | |
61 | do | |
62 | { | |
63 | curChar = fgetc(ifile); | |
64 | headerShift++; | |
65 | if (curChar == '\n' || curChar == '\r') | |
66 | { | |
67 | //flush all potential other line feeds | |
68 | while (curChar == '\n' || curChar == '\r') | |
69 | curChar = fgetc(ifile); | |
70 | ungetc(curChar, ifile); | |
71 | break; | |
72 | } | |
73 | vector_push(header, curChar); | |
74 | } | |
75 | while (1); | |
76 | char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char)); | |
77 | VectorIterator* it = vector_get_iterator(header); | |
78 | int index = 0; | |
79 | while (vectorI_has_data(it)) | |
80 | { | |
81 | vectorI_get(it, headerString[index]); | |
82 | vectorI_move_next(it); | |
83 | index++; | |
84 | } | |
85 | vectorI_destroy(it); | |
86 | headerString[index] = 0; | |
87 | vector_destroy(header); | |
88 | int position = 1, posID = 0, posPower = 0; | |
89 | char* columnDescriptor = strtok(headerString, ","); | |
90 | while (columnDescriptor != NULL) | |
91 | { | |
92 | if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id")) | |
93 | posID = position; | |
94 | else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE")) | |
95 | posPower = position; | |
96 | position++; | |
97 | columnDescriptor = strtok(NULL, ","); | |
98 | } | |
99 | free(headerString); | |
100 | ||
101 | //estimate tsLength with a scan of the 3 first series | |
102 | uint32_t ID=0, rawPower=0, lastID=0, refTsLength=0; | |
103 | scan_line(ifile, posID, &ID, posPower, &rawPower); | |
104 | //'sl' = sample lengths (short because a lot of comparisons then) | |
105 | uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t)); | |
106 | for (int i=0; i<3; i++) | |
107 | { | |
108 | lastID = ID; | |
109 | while (ID == lastID) | |
110 | { | |
111 | sl[i]++; | |
112 | scan_line(ifile, posID, &ID, posPower, &rawPower); | |
113 | } | |
114 | } | |
115 | if (sl[1] <= sl[0] <= sl[2] || sl[2] <= sl[0] <= sl[1]) | |
116 | refTsLength = sl[0]; | |
117 | else if (sl[0] <= sl[1] <= sl[2] || sl[2] <= sl[1] <= sl[0]) | |
118 | refTsLength = sl[1]; | |
119 | else | |
120 | refTsLength = sl[2]; | |
121 | free(sl); | |
122 | //go back at the beginning of the first series (ready to read '\n'...) | |
123 | fseek(ifile, headerShift-1, SEEK_SET); | |
124 | ||
125 | // output file to write time-series sequentially, binary format. | |
126 | // Format: for each series, ID is stored on 4 bytes (unsigned integer32). Then, | |
127 | // (<rawPower>)+ follow, with rawPower stored as a "3 bytes int" | |
128 | // rawPower values are multiplied by 10 and truncated one digit after 0 | |
129 | // NOTE: no raw power should be exactly zero | |
130 | FILE* ofile = fopen(ofileName, "wb"); | |
131 | ||
132 | // leave space to write the number of series (32bits), and their length in bytes (32bits) | |
133 | for (int i = 0; i < 8; i++) | |
134 | fputc(0, ofile); | |
135 | ||
136 | // process one client (ID in first column) at a time | |
137 | uint64_t processedLines = 0; //execution trace | |
138 | uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; | |
139 | uint32_t mismatchLengthCount=0, overflowCount=0; | |
140 | Byte tsBuffer[4+3*refTsLength]; | |
141 | int overflow = 0; | |
142 | lastID = 0; | |
143 | while (!feof(ifile)) | |
144 | { | |
145 | // next element to read always start with a digit | |
146 | do | |
147 | curChar = fgetc(ifile); | |
148 | while (!feof(ifile) && (curChar < '0' || curChar > '9')); | |
149 | if (feof(ifile)) | |
150 | break; | |
151 | ungetc(curChar, ifile); | |
152 | ||
153 | // read line | |
154 | scan_line(ifile, posID, &ID, posPower, &rawPower); | |
155 | if (ID != lastID) | |
156 | { | |
157 | //just starting a new time-series: must process the last one (if there is a last one !) | |
158 | if (lastID > 0) | |
159 | { | |
160 | if (tsLength == refTsLength && !overflow) | |
161 | { | |
162 | seriesCount++; | |
163 | fwrite(tsBuffer, 4+3*tsLength, 1, ofile); | |
164 | if (nbItems > 0 && seriesCount >= nbItems) | |
165 | break; | |
166 | } | |
167 | //if something wrong happened, skip series | |
168 | else | |
169 | { | |
170 | skippedSeriesCount++; | |
171 | if (tsLength != refTsLength) | |
172 | mismatchLengthCount++; | |
173 | if (overflow) | |
174 | overflowCount++; | |
175 | } | |
176 | } | |
177 | ||
178 | // ID for the new series is printed only once: | |
179 | write_int(ID, 4, tsBuffer); | |
180 | // reinitialize flags | |
181 | overflow = 0; | |
182 | tsLength = 0; | |
183 | lastID = ID; | |
184 | } | |
185 | ||
186 | overflow = (overflow || (rawPower >= (1 << 24))); | |
187 | //We cannot write more than refTsLength bytes | |
188 | if (tsLength < refTsLength) | |
189 | write_int(rawPower, 3, tsBuffer + 4+3*tsLength); | |
190 | tsLength++; | |
191 | ||
192 | if ((++processedLines) % 1000000 == 0) | |
193 | fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); | |
194 | } | |
195 | ||
196 | if (!overflow && tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) | |
197 | { | |
198 | // flush last time-series if all conditions are met | |
199 | fwrite(tsBuffer, 4+3*tsLength, 1, ofile); | |
200 | seriesCount++; | |
201 | } | |
202 | else if (nbItems <= 0 || seriesCount < nbItems) | |
203 | { | |
204 | if (tsLength != refTsLength) | |
205 | mismatchLengthCount++; | |
206 | if (overflow) | |
207 | overflowCount++; | |
208 | } | |
209 | ||
210 | // write lines count and size of a time-series in bytes | |
211 | Byte intBuffer[4]; | |
212 | fseek(ofile, 0, SEEK_SET); | |
213 | write_int(seriesCount, 4, intBuffer); | |
214 | fwrite(intBuffer, 1, 4, ofile); | |
215 | // re-express tsLength in bytes (not forgetting the ID)) | |
216 | write_int(4 + 3 * refTsLength, 4, intBuffer); | |
217 | fwrite(intBuffer, 1, 4, ofile); | |
218 | ||
219 | // finally print some statistics | |
220 | if (seriesCount < nbItems) | |
221 | fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount); | |
222 | fprintf(stdout,"%u overflows / %u mismatch series lengths.\n",overflowCount,mismatchLengthCount); | |
223 | ||
224 | fclose(ifile); | |
225 | fclose(ofile); | |
226 | } | |
227 | ||
228 | //serialize from usual 'by-row' data (for StarLight example and toy dataset) | |
229 | void serialize_byRows(const char* ifileName, const char* ofileName, uint32_t nbItems) | |
230 | { | |
231 | FILE* ifile = fopen(ifileName, "r"); | |
232 | // first scan to know dimensions | |
233 | uint32_t nbValues = 0; //ID (or, more useful, real class number) comes first | |
234 | char curChar = ' '; | |
235 | while (curChar != '\n' && curChar != '\r') | |
236 | { | |
237 | curChar = fgetc(ifile); | |
238 | if (curChar == ',') | |
239 | { | |
240 | nbValues++; | |
241 | //skip potential consecutive commas (could be hard to spot) | |
242 | while (curChar == ',') | |
243 | curChar = fgetc(ifile); | |
244 | ungetc(curChar, ifile); | |
245 | } | |
246 | } | |
247 | while (curChar == '\n' || curChar == '\r') | |
248 | curChar = fgetc(ifile); | |
249 | ungetc(curChar, ifile); | |
250 | uint32_t nbSeries = 1; //we already read 1st line | |
251 | while (!feof(ifile)) | |
252 | { | |
253 | if ((curChar = fgetc(ifile)) == '\n') | |
254 | nbSeries++; | |
255 | } | |
256 | fseek(ifile, 0, SEEK_SET); | |
257 | ||
258 | //write meta info first | |
259 | uint32_t tsLength = 3*nbValues+4; | |
260 | FILE* ofile = fopen(ofileName, "wb"); | |
261 | Byte intBuffer[4]; | |
262 | write_int(nbSeries, 4, intBuffer); | |
263 | fwrite(intBuffer, 1, 4, ofile); | |
264 | write_int(tsLength, 4, intBuffer); | |
265 | fwrite(intBuffer, 1, 4, ofile); | |
266 | Real rawPower; | |
267 | int64_t ID; | |
268 | ||
269 | //DEBUG / TEST (ugly, TOFIX...) | |
270 | double minrp = INFINITY, maxrp = -INFINITY; | |
271 | ||
272 | for (uint32_t i=0; i<nbSeries; i++) | |
273 | { | |
274 | //skip potential line feeds before next line | |
275 | while (curChar == '\n' || curChar == '\r') | |
276 | curChar = fgetc(ifile); | |
277 | ungetc(curChar, ifile); | |
278 | curChar = readInt(ifile, &ID); | |
279 | write_int((uint32_t)ID, 4, intBuffer); | |
280 | fwrite(intBuffer, 1, 4, ofile); | |
281 | while (curChar == ',') | |
282 | curChar = fgetc(ifile); | |
283 | ungetc(curChar, ifile); | |
284 | for (uint32_t j=0; j<nbValues; j++) | |
285 | { | |
286 | curChar = readReal(ifile, &rawPower); | |
287 | ||
288 | //DEBUG / TEST (ugly, TOFIX...) | |
289 | if (rawPower < minrp) | |
290 | minrp = rawPower; | |
291 | if (rawPower > maxrp) | |
292 | maxrp = rawPower; | |
293 | ||
294 | write_int((uint32_t)floor(10.0*(rawPower+0.0)), 3, intBuffer); //x10... +3... | |
295 | fwrite(intBuffer, 1, 3, ofile); | |
296 | while (curChar == ',') | |
297 | curChar = fgetc(ifile); | |
298 | ungetc(curChar, ifile); | |
299 | } | |
300 | } | |
301 | fclose(ifile); | |
302 | fclose(ofile); | |
303 | ||
304 | //DEBUG / TEST (ugly, TOFIX...) | |
305 | printf("min / max values = %g %g\n",minrp,maxrp); | |
306 | } |