1 #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
3 #include <cgds/Vector.h>
/**
 * Read the next decimal integer found in `stream`.
 *
 * Every character up to the first digit is skipped. The number is negative
 * only when the very first character read is '-'. The character that ends
 * the digit run is consumed and returned, so the caller can tell which
 * separator (',', '.', 'e', '\n', ...) followed the number.
 *
 * @param stream  input stream, positioned anywhere before the number
 * @param integer receives the parsed value; set to 0 when no digit is found
 * @return the character following the last digit, or (char)EOF when the end
 *         of the stream is reached (before or right after the digits)
 */
char readInt(FILE* stream, int64_t* integer)
{
	*integer = 0;

	// fgetc() returns an int: keep it as such so that EOF stays distinguishable
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);

	// Skip everything up to the first digit, but stop at end of file:
	// without the EOF test this loop never terminates on exhausted input.
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	if (curChar == EOF)
		return (char)EOF;

	// Accumulate the digits; curChar ends up holding the terminating character
	do
	{
		*integer = 10 * (*integer) + (int64_t)(curChar - '0');
		curChar = fgetc(stream);
	}
	while (curChar >= '0' && curChar <= '9');

	*integer *= sign;
	return (char)curChar;
}
/**
 * Read the next real number found in `stream`, written as
 * [-]digits[.digits][e|E[-]digits].
 *
 * The integer part, the fractional part and the exponent are each parsed
 * with readInt(). Leading zeros of the fractional part are counted
 * separately because readInt() would otherwise lose them ("1.05" vs "1.5").
 *
 * @param stream input stream, positioned at the start of the number
 * @param real   receives the parsed value
 * @return the character that follows the number (already consumed)
 */
char readReal(FILE* stream, float* real)
{
	// Take the sign from the leading character, exactly as readInt() does.
	// The sign of the integer part cannot be used for this: it is lost
	// whenever that part is 0, which made "0.5" come out as -0.5.
	int firstChar = fgetc(stream);
	const double sign = (firstChar == '-' ? -1. : 1.);
	if (firstChar != EOF)
		ungetc(firstChar, stream);

	int64_t integerPart;
	int nextChar = readInt(stream, &integerPart);

	int64_t fractionalPart = 0;
	int countZeros = 0;
	if (nextChar == '.')
	{
		// count the zeros directly following the decimal point
		while ((nextChar = fgetc(stream)) == '0')
			countZeros++;
		if (nextChar >= '1' && nextChar <= '9')
		{
			// give the first significant digit back, then parse the rest
			ungetc(nextChar, stream);
			nextChar = readInt(stream, &fractionalPart);
		}
	}

	int64_t exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	// Number of decimal places = leading zeros + digits of fractionalPart
	// (a null fractional part counts as one digit, which is harmless: 0/10 = 0)
	const double decimals = countZeros
		+ floor(log10(fractionalPart > 0 ? (double)fractionalPart : 1.) + 1);
	*real = (float)( ((double)integerPart
		+ sign * (double)fractionalPart / pow(10, decimals))
		* pow(10, (double)exponent) );

	return (char)nextChar;
}
/**
 * Parse one line of the input file into an integer ID and a float power.
 *
 * Fields are comma-separated and numbered from 1. The field at `posID` is
 * read with readInt(), the field at `posPower` with readReal(); every other
 * field is skipped. On return, the stream is positioned on the first
 * character of the next non-empty line (or at end of file).
 *
 * @param ifile    input stream, positioned at the start of a line
 * @param posID    1-based index of the ID column
 * @param ID       receives the ID, truncated to 32 bits
 * @param posPower 1-based index of the raw power column
 * @param rawPower receives the raw power value
 *
 * NOTE(review): an empty power field is not detected: readReal() skips ahead
 * to the next digit, which may belong to a later field or line.
 */
static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	int nextChar;
	int position = 1;
	while (1)
	{
		if (position == posID)
		{
			int64_t ID_on64bits;
			nextChar = readInt(ifile, &ID_on64bits);
			*ID = (uint32_t)ID_on64bits;
		}
		else if (position == posPower)
		{
			float power = FLT_MAX; //"NA"
			nextChar = readReal(ifile, &power);
			*rawPower = power;
		}
		else
		{
			// field of no interest: start consuming it
			nextChar = fgetc(ifile);
		}

		// continue until next comma (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds. The parentheses matter: '&&' binds
			// tighter than '||', so without them the end-of-file test would not
			// guard the '\r' case.
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
88 // Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header
// Parameters:
//   ifileName: path of the input file, opened below in "r" mode
//   ofileName: path of the output file, opened below in "w" mode
//   nbItems:   appears to cap the number of series written; the `nbItems > 0` and
//              `nbItems <= 0` tests below treat 0 as "no limit" (nbItems is unsigned)
// NOTE(review): the comments added in this function only describe the statements
// that are visible in this listing.
89 void transform(const char* ifileName
, const char* ofileName
, uint32_t nbItems
)
91 // Use the header to know positions of ID and rawPower
// NOTE(review): no NULL check is visible after this fopen() - confirm it cannot fail silently.
92 FILE* ifile
= fopen(ifileName
, "r");
// Used further down as the fseek() offset to come back right after the header;
// presumably counts the characters of the header line - TODO confirm.
93 uint32_t headerShift
= 0;
// Growable buffer of char receiving the header line (see vector_push below).
95 Vector
* header
= vector_new(char);
98 curChar
= fgetc(ifile
);
// A line feed marks the end of the header line.
100 if (curChar
== '\n' || curChar
== '\r')
102 // Flush all potential other line feeds
103 while (curChar
== '\n' || curChar
== '\r')
104 curChar
= fgetc(ifile
);
// Give back the first character that is not a line feed.
105 ungetc(curChar
, ifile
);
108 vector_push(header
, curChar
);
// Copy the header characters into a C string; the "+ 1" leaves room for the
// terminating 0 written after the copy loop.
111 char* headerString
= (char*)malloc((vector_size(header
) + 1)*sizeof(char));
112 VectorIterator
* it
= vector_get_iterator(header
);
114 while (vectorI_has_data(it
))
116 vectorI_get(it
, headerString
[index
]);
117 vectorI_move_next(it
);
121 headerString
[index
] = 0;
122 vector_destroy(header
);
// Column positions are counted from 1; posID and posPower start at 0,
// presumably meaning "column not found" - TODO confirm.
123 int position
= 1, posID
= 0, posPower
= 0;
// Split the header on commas; strtok() writes into headerString while doing so.
124 char* columnDescriptor
= strtok(headerString
, ",");
125 while (columnDescriptor
!= NULL
)
// The ID column is recognized under two spellings, the power column under one.
127 if (!strcmp(columnDescriptor
,"FK_CCU_ID") || !strcmp(columnDescriptor
,"fk_ccu_id"))
129 else if (!strcmp(columnDescriptor
,"CPP_PUISSANCE_BRUTE"))
132 columnDescriptor
= strtok(NULL
, ",");
136 // Estimate tsLength with a scan of the 3 first series
137 uint32_t ID
=0, lastID
=0, refTsLength
=0;
// Read the first data line to get the first ID.
139 scan_line(ifile
, posID
, &ID
, posPower
, &rawPower
);
140 //'sl' = sample lengths (short because a lot of comparisons then)
141 uint32_t* sl
= (uint32_t*) calloc(3, sizeof(uint32_t));
142 for (int i
=0; i
<3; i
++)
148 scan_line(ifile
, posID
, &ID
, posPower
, &rawPower
);
// NOTE(review): in C, `a <= b <= c` parses as `(a <= b) <= c`, i.e. it compares
// 0 or 1 against c. These tests therefore do not order the three lengths;
// `a <= b && b <= c` was probably intended - confirm and fix.
151 if (sl
[0] <= sl
[1] <= sl
[2] || sl
[1] <= sl
[0] <= sl
[2])
153 else if (sl
[1] <= sl
[2] <= sl
[0] || sl
[2] <= sl
[1] <= sl
[0])
158 //go back at the beginning of the first series (ready to read '\n'...)
// NOTE(review): headerShift is unsigned, so `headerShift-1` wraps around to a
// huge offset if headerShift is still 0 here.
159 fseek(ifile
, headerShift
-1, SEEK_SET
);
161 // output file to write time-series sequentially, CSV format.
// NOTE(review): no NULL check is visible after this fopen() either.
162 FILE* ofile
= fopen(ofileName
, "w");
164 // process one client (ID in first column) at a time
165 uint64_t processedLines
= 0; //execution trace
166 uint32_t seriesCount
=0, skippedSeriesCount
=0, tsLength
=0;
167 uint32_t mismatchLengthCount
=0;
// Variable-length array holding the values of the series being read.
// NOTE(review): a size of 0 is undefined behavior for a VLA, and a very large
// refTsLength may exhaust the stack.
168 float tsBuffer
[refTsLength
];
172 // next element to read always start with a digit
174 curChar
= fgetc(ifile
);
// NOTE(review): presumably the closing condition of a do/while around the
// fgetc() above; as a plain while with an empty body it would never end - confirm.
175 while (!feof(ifile
) && (curChar
< '0' || curChar
> '9'));
// Give the digit back so that scan_line() starts on it.
178 ungetc(curChar
, ifile
);
181 scan_line(ifile
, posID
, &ID
, posPower
, &rawPower
);
184 //just starting a new time-series: must process the last one (if there is a last one !)
// A series is written only when it holds exactly refTsLength values.
187 if (tsLength
== refTsLength
)
189 for (int i
=0; i
<tsLength
; i
++)
// Comma after every value except the last one of the row.
// NOTE(review): `format` points to a string literal; `const char*` would be safer.
191 char* format
= i
<tsLength
-1 ? "%g," : "%g";
192 fprintf(ofile
, format
, tsBuffer
[i
]);
194 fprintf(ofile
, "\n");
// Counts the series just written and tests whether the nbItems cap is reached.
195 if (nbItems
> 0 && ++seriesCount
>= nbItems
)
198 //if something wrong happened, skip series
201 skippedSeriesCount
++;
202 if (tsLength
!= refTsLength
)
203 mismatchLengthCount
++;
207 // reinitialize flags
212 //We cannot write more than refTsLength values
213 if (tsLength
< refTsLength
)
214 tsBuffer
[tsLength
++] = rawPower
;
// Progress trace every million lines; PRIu64 is the printf format for uint64_t.
216 if ((++processedLines
) % 1000000 == 0)
217 fprintf(stdout
,"Processed %"PRIu64
" lines\n", processedLines
);
220 if (tsLength
== refTsLength
&& (nbItems
<= 0 || seriesCount
< nbItems
))
222 // flush last time-series if all conditions are met
223 for (int i
=0; i
<tsLength
; i
++)
225 char* format
= i
<tsLength
-1 ? "%g," : "%g";
226 fprintf(ofile
, format
, tsBuffer
[i
]);
228 fprintf(ofile
, "\n");
231 else if (nbItems
<= 0 || seriesCount
< nbItems
)
233 if (tsLength
!= refTsLength
)
234 mismatchLengthCount
++;
237 // finally print some statistics
// With nbItems == 0 (no limit) this test is never true, so no warning is printed.
238 if (seriesCount
< nbItems
)
239 fprintf(stdout
,"Warning: only %u series retrieved.\n",seriesCount
);
240 fprintf(stdout
,"%u mismatch series lengths.\n",mismatchLengthCount
);