Commit | Line | Data |
---|---|---|
c133b1bd BA |
1 | #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers |
2 | #include <inttypes.h> | |
3 | #include <cgds/Vector.h> | |
4 | #include <string.h> | |
5 | #include <math.h> | |
6 | #include <float.h> | |
7 | ||
// Read the next decimal integer from 'stream' into '*integer'.
//
// A '-' is honoured only when it is the very first character read; any other
// leading non-digit characters are skipped. If the stream ends before a digit
// is found, '*integer' is set to 0.
//
// Returns the first character following the digits (already consumed from the
// stream), or (char)EOF when the end of the stream was reached.
char readInt(FILE* stream, int64_t* integer)
{
	*integer = 0;
	// fgetc() returns an int: keeping it in an int lets EOF be told apart from data
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);

	// skip everything up to the first digit, but never spin on an exhausted stream
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);

	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (int64_t) (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar;
}
21 | ||
// Read a decimal number such as "-12.034e-5" from 'stream' into '*real'.
//
// The integer part and the optional exponent are parsed with readInt(); the
// fractional digits are parsed here. If the stream is already exhausted,
// '*real' is set to 0.
//
// Returns the first character following the number (already consumed from the
// stream), or (char)EOF when the end of the stream was reached.
char readReal(FILE* stream, float* real)
{
	// readInt() folds the sign into the integer part, so the sign is lost when
	// that part is 0 ("-0.5" vs "0.5"): look at the first character ourselves.
	int firstChar = fgetc(stream);
	if (firstChar == EOF)
	{
		*real = 0.;
		return (char) EOF;
	}
	ungetc(firstChar, stream);
	int negative = (firstChar == '-');

	int64_t integerPart;
	char nextChar = readInt(stream, &integerPart);
	double magnitude = (double) (integerPart < 0 ? -integerPart : integerPart);

	if (nextChar == '.')
	{
		int64_t fraction = 0;
		int digits = 0;
		int curChar;
		while ((curChar = fgetc(stream)) >= '0' && curChar <= '9')
		{
			// 18 decimal digits always fit in an int64_t; further digits are
			// consumed but ignored (far beyond float precision anyway)
			if (digits < 18)
			{
				fraction = 10 * fraction + (curChar - '0');
				digits++;
			}
		}
		magnitude += (double) fraction / pow(10, digits);
		nextChar = (char) curChar;
	}

	int64_t exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	*real = (float) ((negative ? -magnitude : magnitude) * pow(10, (double) exponent));
	return nextChar;
}
47 | ||
// Parse one line of 'ifile' into integer+float (ID, raw power).
//
// Fields are comma-separated and numbered from 1; 'posID' and 'posPower' are
// the positions of the two fields of interest, every other field is skipped.
// The stream must be positioned at the start of a line. An empty ID field
// yields *ID = 0 and an empty power field yields *rawPower = FLT_MAX ("NA").
// On return the line terminator(s) have been consumed, so the stream is at the
// start of the next line or at end of file.
static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	char nextChar;
	int position = 1;
	while (1)
	{
		if (position == posID || position == posPower)
		{
			// An empty field must not be handed to readInt()/readReal(): they
			// skip non-digits and would swallow the delimiter and the next field.
			int peeked = fgetc(ifile);
			if (peeked == EOF || peeked == ',' || peeked == '\n' || peeked == '\r')
			{
				nextChar = (char) peeked;
				if (position == posID)
					*ID = 0;
				else
					*rawPower = FLT_MAX; //"NA"
			}
			else
			{
				ungetc(peeked, ifile);
				if (position == posID)
				{
					int64_t ID_on64bits;
					nextChar = readInt(ifile, &ID_on64bits);
					*ID = (uint32_t)ID_on64bits;
				}
				else
					nextChar = readReal(ifile, rawPower);
			}
		}
		else
			//erase the comma (and skip field then)
			nextChar = fgetc(ifile);

		//continue until next comma (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// skip all potential line feeds
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
87 | ||
88 | // Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header | |
89 | void transform(const char* ifileName, const char* ofileName, uint32_t nbItems) | |
90 | { | |
91 | // Use the header to know positions of ID and rawPower | |
92 | FILE* ifile = fopen(ifileName, "r"); | |
93 | uint32_t headerShift = 0; | |
94 | char curChar; | |
95 | Vector* header = vector_new(char); | |
96 | do | |
97 | { | |
98 | curChar = fgetc(ifile); | |
99 | headerShift++; | |
100 | if (curChar == '\n' || curChar == '\r') | |
101 | { | |
102 | // Flush all potential other line feeds | |
103 | while (curChar == '\n' || curChar == '\r') | |
104 | curChar = fgetc(ifile); | |
105 | ungetc(curChar, ifile); | |
106 | break; | |
107 | } | |
108 | vector_push(header, curChar); | |
109 | } | |
110 | while (1); | |
111 | char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char)); | |
112 | VectorIterator* it = vector_get_iterator(header); | |
113 | int index = 0; | |
114 | while (vectorI_has_data(it)) | |
115 | { | |
116 | vectorI_get(it, headerString[index]); | |
117 | vectorI_move_next(it); | |
118 | index++; | |
119 | } | |
120 | vectorI_destroy(it); | |
121 | headerString[index] = 0; | |
122 | vector_destroy(header); | |
123 | int position = 1, posID = 0, posPower = 0; | |
124 | char* columnDescriptor = strtok(headerString, ","); | |
125 | while (columnDescriptor != NULL) | |
126 | { | |
127 | if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id")) | |
128 | posID = position; | |
129 | else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE")) | |
130 | posPower = position; | |
131 | position++; | |
132 | columnDescriptor = strtok(NULL, ","); | |
133 | } | |
134 | free(headerString); | |
135 | ||
136 | // Estimate tsLength with a scan of the 3 first series | |
137 | uint32_t ID=0, lastID=0, refTsLength=0; | |
138 | float rawPower = 0.; | |
139 | scan_line(ifile, posID, &ID, posPower, &rawPower); | |
140 | //'sl' = sample lengths (short because a lot of comparisons then) | |
141 | uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t)); | |
142 | for (int i=0; i<3; i++) | |
143 | { | |
144 | lastID = ID; | |
145 | while (ID == lastID) | |
146 | { | |
147 | sl[i]++; | |
148 | scan_line(ifile, posID, &ID, posPower, &rawPower); | |
149 | } | |
150 | } | |
151 | if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2]) | |
152 | refTsLength = sl[2]; | |
153 | else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0]) | |
154 | refTsLength = sl[0]; | |
155 | else | |
156 | refTsLength = sl[1]; | |
157 | free(sl); | |
158 | //go back at the beginning of the first series (ready to read '\n'...) | |
159 | fseek(ifile, headerShift-1, SEEK_SET); | |
160 | ||
161 | // output file to write time-series sequentially, CSV format. | |
162 | FILE* ofile = fopen(ofileName, "w"); | |
163 | ||
164 | // process one client (ID in first column) at a time | |
165 | uint64_t processedLines = 0; //execution trace | |
166 | uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; | |
167 | uint32_t mismatchLengthCount=0; | |
168 | float tsBuffer[refTsLength]; | |
169 | lastID = 0; | |
170 | while (!feof(ifile)) | |
171 | { | |
172 | // next element to read always start with a digit | |
173 | do | |
174 | curChar = fgetc(ifile); | |
175 | while (!feof(ifile) && (curChar < '0' || curChar > '9')); | |
176 | if (feof(ifile)) | |
177 | break; | |
178 | ungetc(curChar, ifile); | |
179 | ||
180 | // read line | |
181 | scan_line(ifile, posID, &ID, posPower, &rawPower); | |
182 | if (ID != lastID) | |
183 | { | |
184 | //just starting a new time-series: must process the last one (if there is a last one !) | |
185 | if (lastID > 0) | |
186 | { | |
187 | if (tsLength == refTsLength) | |
188 | { | |
189 | for (int i=0; i<tsLength; i++) | |
190 | { | |
191 | char* format = i<tsLength-1 ? "%g," : "%g"; | |
192 | fprintf(ofile, format, tsBuffer[i]); | |
193 | } | |
194 | fprintf(ofile, "\n"); | |
195 | if (nbItems > 0 && ++seriesCount >= nbItems) | |
196 | break; | |
197 | } | |
198 | //if something wrong happened, skip series | |
199 | else | |
200 | { | |
201 | skippedSeriesCount++; | |
202 | if (tsLength != refTsLength) | |
203 | mismatchLengthCount++; | |
204 | } | |
205 | } | |
206 | ||
207 | // reinitialize flags | |
208 | tsLength = 0; | |
209 | lastID = ID; | |
210 | } | |
211 | ||
212 | //We cannot write more than refTsLength values | |
213 | if (tsLength < refTsLength) | |
214 | tsBuffer[tsLength++] = rawPower; | |
215 | ||
216 | if ((++processedLines) % 1000000 == 0) | |
217 | fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); | |
218 | } | |
219 | ||
220 | if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) | |
221 | { | |
222 | // flush last time-series if all conditions are met | |
223 | for (int i=0; i<tsLength; i++) | |
224 | { | |
225 | char* format = i<tsLength-1 ? "%g," : "%g"; | |
226 | fprintf(ofile, format, tsBuffer[i]); | |
227 | } | |
228 | fprintf(ofile, "\n"); | |
229 | seriesCount++; | |
230 | } | |
231 | else if (nbItems <= 0 || seriesCount < nbItems) | |
232 | { | |
233 | if (tsLength != refTsLength) | |
234 | mismatchLengthCount++; | |
235 | } | |
236 | ||
237 | // finally print some statistics | |
238 | if (seriesCount < nbItems) | |
239 | fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount); | |
240 | fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount); | |
241 | ||
242 | fclose(ifile); | |
243 | fclose(ofile); | |
244 | } |