// convert_to_CSV.c (epclust.git: data/preprocessing)
// Prepare converter for DB extract datasets
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
7
// Read the next decimal integer found in 'stream' and store it in *integer.
//
// Every character that is not a digit is skipped until the first digit is met
// (this includes separators and line breaks). The result is negated only when
// the very first character read is '-'.
//
// Returns the character following the last digit (already consumed from the
// stream). If the end of the stream is reached before any digit is found,
// *integer is set to 0 and EOF converted to char is returned; feof(stream) is
// then true.
char readInt(FILE* stream, int64_t* integer)
{
	*integer = 0;
	// fgetc() returns an int: storing it in a char makes EOF undetectable,
	// which would turn the skipping loop below into an infinite loop at end of file
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);

	// skip everything up to the first digit, giving up at end of file
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);

	// accumulate consecutive digits
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (int64_t) (curChar - '0');
		curChar = fgetc(stream);
	}

	(*integer) *= sign;
	return (char) curChar;
}
21
// Read the next real number found in 'stream' and store it in *real.
//
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]. The integer part and the
// exponent are parsed by readInt(), so leading non-digit characters are skipped
// the same way as there.
//
// Returns the first character following the number (already consumed).
//
// NOTE(review): an empty field is not detected; readInt() will skip forward to
// the next digit in the stream, possibly in another field or line.
char readReal(FILE* stream, float* real)
{
	// The sign must be captured here: readInt() cannot report it when the
	// integer part is 0 ("-0.5"), and deriving it from integerPart>0 turned
	// "0.5" into -0.5
	int firstChar = fgetc(stream);
	if (firstChar != EOF)
		ungetc(firstChar, stream);
	int negative = (firstChar == '-');

	int64_t integerPart;
	int nextChar = readInt(stream, &integerPart);
	double magnitude = (double) (integerPart < 0 ? -integerPart : integerPart);

	if (nextChar == '.')
	{
		// accumulate fractional digits in floating point (leading zeros included)
		// so that long fractions cannot overflow an integer accumulator
		double fraction = 0., scale = 1.;
		while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9')
		{
			fraction = 10. * fraction + (double) (nextChar - '0');
			scale *= 10.;
		}
		magnitude += fraction / scale;
	}

	int64_t exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	*real = (float) ((negative ? -magnitude : magnitude) * pow(10., (double) exponent));
	return (char) nextChar;
}
47
// Parse one line of the input file into integer+float (ID, raw power).
//
// Fields are comma-separated and numbered from 1. The field at position posID
// is read with readInt() into *ID, the field at position posPower is read with
// readReal() into *rawPower; every other field is skipped.
// The field at position posTime is not parsed yet: *time is left untouched.
//
// On return the stream is positioned at the first character of the next
// non-empty line (or at end of file).
static void scan_line(FILE* ifile,
	int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	(void) time; //not written until the datetime conversion is implemented

	int nextChar;
	int position = 1;
	while (1)
	{
		if (position == posTime)
		{
			//TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
			// Until then, consume the first character of the field like for any
			// skipped field, so that nextChar is never read uninitialized below
			nextChar = fgetc(ifile);
		}
		else if (position == posID)
		{
			int64_t ID_on64bits;
			nextChar = readInt(ifile, &ID_on64bits);
			*ID = (uint32_t)ID_on64bits;
		}
		else if (position == posPower)
		{
			float power = FLT_MAX; //"NA"
			nextChar = readReal(ifile, &power); //?? WARNING here... if empty field ?!
			*rawPower = power;
		}
		else
		{
			// read the first character of the field; the loop below skips the rest
			// (an empty field yields the separator itself)
			nextChar = fgetc(ifile);
		}

		//continue until next comma (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// skip all potential line feeds; the parentheses matter since
			// && binds tighter than ||
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
92
93
//TODO: check the datetime on each line (build a datetimes file, one per year?)
//also fill NA values with the closest value in the file (easy)
//e.g. 01JAN2009:00:00:00 ...
97
98
99 // Main job: parse a data file into a conventional CSV file in rows, without header
100 void transform(const char* ifileName, int posID, int posTime, int posValue,
101 char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems
102 {
103 //TODO: complete timedate vector from first_time and last_time
104 // --> this gives (expected) tsLength for free
105
106 FILE* ifile = fopen(ifileName, "r");
107 // output file to write time-series sequentially, CSV format.
108 FILE* ofile = fopen(ofileName, "w");
109
110 // Skip header
111 char nextChar;
112 do
113 nextChar = fgetc(ifile);
114 while (!feof(ifile) && nextChar != '\n' && nextChar != '\r')
115
116 // process one client (ID in first column) at a time
117 uint64_t processedLines = 0; //execution trace
118 uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
119 uint32_t mismatchLengthCount=0;
120 float tsBuffer[refTsLength];
121 lastID = 0;
122 while (!feof(ifile))
123 {
124 // next element to read always start with a digit
125 do
126 curChar = fgetc(ifile);
127 while (!feof(ifile) && (curChar < '0' || curChar > '9'));
128 if (feof(ifile))
129 break;
130 ungetc(curChar, ifile);
131
132 // read line
133 scan_line(ifile, posID, &ID, posPower, &rawPower);
134 if (ID != lastID)
135 {
136 //just starting a new time-series: must process the last one (if there is a last one !)
137 if (lastID > 0)
138 {
139 if (tsLength == refTsLength)
140 {
141 for (int i=0; i<tsLength; i++)
142 {
143 char* format = i<tsLength-1 ? "%g," : "%g";
144 fprintf(ofile, format, tsBuffer[i]);
145 }
146 fprintf(ofile, "\n");
147 if (nbItems > 0 && ++seriesCount >= nbItems)
148 break;
149 }
150 //if something wrong happened, skip series
151 else
152 {
153 skippedSeriesCount++;
154 if (tsLength != refTsLength)
155 mismatchLengthCount++;
156 }
157 }
158
159 // reinitialize flags
160 tsLength = 0;
161 lastID = ID;
162 }
163
164 //We cannot write more than refTsLength values
165 if (tsLength < refTsLength)
166 tsBuffer[tsLength++] = rawPower;
167
168 if ((++processedLines) % 1000000 == 0)
169 fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
170 }
171
172 if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems))
173 {
174 // flush last time-series if all conditions are met
175 for (int i=0; i<tsLength; i++)
176 {
177 char* format = i<tsLength-1 ? "%g," : "%g";
178 fprintf(ofile, format, tsBuffer[i]);
179 }
180 fprintf(ofile, "\n");
181 seriesCount++;
182 }
183 else if (nbItems <= 0 || seriesCount < nbItems)
184 {
185 if (tsLength != refTsLength)
186 mismatchLengthCount++;
187 }
188
189 // finally print some statistics
190 if (seriesCount < nbItems)
191 fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount);
192 fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount);
193
194 fclose(ifile);
195 fclose(ofile);
196 }
197
// Entry point. Expected arguments, in order:
//   ifileName posID posTime posValue firstTime lastTime ofileName nbItems
// Returns 0 after running transform(), 1 if arguments are missing.
int main(int argc, char** argv)
{
	// 8 arguments are required in addition to the program name
	if (argc < 9)
	{
		fprintf(stderr,
			"Usage: %s ifileName posID posTime posValue firstTime lastTime ofileName nbItems\n",
			argc > 0 ? argv[0] : "convert_to_CSV");
		return 1;
	}

	//TODO: args checks (atoi() does not report invalid numbers)
	transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
		argv[5], argv[6], argv[7], atoi(argv[8]));
	return 0;
}