Commit | Line | Data |
---|---|---|
c133b1bd BA |
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
c133b1bd | 8 | |
a0fa5bd0 BA |
// Read an integer char by char, and position the cursor to next character.
//
// The sign is taken from the very first character read ('-' means negative).
// Every non-digit character before the first digit is skipped, then all
// consecutive decimal digits are accumulated into *integer.
//
// stream:  input stream, positioned at the start of the field to parse
// integer: receives the parsed value (0 if no digit is found before EOF)
// Returns the first character following the digits (already consumed).
// When the end of the stream is reached, the return value is (char) EOF and
// feof(stream) is true: callers must test feof() to tell them apart.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// Keep the result of fgetc() in an int: a char cannot represent EOF
	// reliably, and the skipping loop below would never end at end of file.
	int nextChar = fgetc(stream);
	int sign = (nextChar == '-' ? -1 : 1);
	while (nextChar != EOF && (nextChar < '0' || nextChar > '9'))
		nextChar = fgetc(stream);
	while (nextChar >= '0' && nextChar <= '9')
	{
		*integer = 10 * (*integer) + (nextChar - '0');
		nextChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) nextChar;
}
28 | ||
// Read a real number char by char, and position the cursor to next character.
//
// Accepted shape: [-]digits[.digits][(e|E)[+|-]digits]
// The sign is taken from the very first character read; every non-digit
// character before the first digit is skipped.
//
// stream: input stream, positioned at the start of the field to parse
// real:   receives the parsed value (0 if no digit is found before EOF)
// Returns the first character following the number (already consumed).
// When the end of the stream is reached, the return value is (char) EOF and
// feof(stream) is true.
char readReal(FILE* stream, float* real)
{
	// The sign must be remembered separately: it cannot be recovered from the
	// integer part, which is 0 for every value in ]-1,1[ ("0.5" vs "-0.5").
	int nextChar = fgetc(stream);
	int negative = (nextChar == '-');
	while (nextChar != EOF && (nextChar < '0' || nextChar > '9'))
		nextChar = fgetc(stream);

	// All significant digits are accumulated in one double mantissa, so that
	// long fractional parts cannot overflow an int.
	double mantissa = 0.;
	while (nextChar >= '0' && nextChar <= '9')
	{
		mantissa = 10. * mantissa + (nextChar - '0');
		nextChar = fgetc(stream);
	}

	int fractionDigits = 0;
	if (nextChar == '.')
	{
		while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9')
		{
			// Digits beyond the 18th are far below float precision:
			// consume them without accumulating.
			if (fractionDigits < 18)
			{
				mantissa = 10. * mantissa + (nextChar - '0');
				fractionDigits++;
			}
		}
	}

	int exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
	{
		nextChar = fgetc(stream);
		int negativeExponent = (nextChar == '-');
		if (nextChar == '-' || nextChar == '+')
			nextChar = fgetc(stream);
		while (nextChar >= '0' && nextChar <= '9')
		{
			// Cap the exponent: anything that large is already out of range
			if (exponent < 1000)
				exponent = 10 * exponent + (nextChar - '0');
			nextChar = fgetc(stream);
		}
		if (negativeExponent)
			exponent = -exponent;
	}

	// value = mantissa * 10^(exponent - fractionDigits)
	double result = 0.;
	if (mantissa != 0.)
	{
		int shift = exponent - fractionDigits;
		double scale = 1.;
		for (int i = (shift < 0 ? -shift : shift); i > 0; i--)
			scale *= 10.;
		// Divide for negative shifts: powers of ten are exact, 10^-n is not
		result = (shift < 0 ? mantissa / scale : mantissa * scale);
	}
	*real = (float) (negative ? -result : result);
	return (char) nextChar;
}
55 | ||
a0fa5bd0 BA |
// Parse a line into integer+float (ID, value).
//
// Fields are numbered from 1 and delimited by 'sep'. The field at position
// posID is parsed with readInt() into *ID, the field at position posValue
// with readReal() into *value; all other fields are skipped. *ID / *value are
// left untouched when the line has fewer fields than the requested position.
// On return the stream is positioned on the first character of the next
// non-empty line (or at end of file).
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	// Deliberately a char (like 'sep'): both then go through the same
	// conversion, so a separator outside the ASCII range still compares equal.
	// End of file is detected with feof(), never through this variable.
	char nextChar;
	int position = 1;
	while (1)
	{
		if (position == posID)
			nextChar = readInt(ifile, ID);
		else if (position == posValue)
			nextChar = readReal(ifile, value);
		else
			nextChar = fgetc(ifile); //consume the first char of a skipped field

		// Continue until next separator (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds. The parentheses matter: '&&'
			// binds tighter than '||', so without them the end-of-file test
			// would only guard the '\n' comparison.
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// Give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
87 | ||
// Write one series as a single CSV row: values separated by 'sep', then '\n'
static void write_series(FILE* ofile, const float* series, int length, char sep)
{
	for (int i=0; i<length; i++)
	{
		if (i > 0)
			fputc(sep, ofile);
		fprintf(ofile, "%g", series[i]);
	}
	fputc('\n', ofile);
}

// Main job: parse a data file into a conventional CSV file in rows, without header
// Current limitations:
//  - remove partial series (we could fill missing values instead)
//  - IDs should be integers
//  - all the lines of a series must be contiguous in the input file
//
// The first line of the input is considered a header and skipped. Consecutive
// lines sharing the same ID form one series; the length of the first series is
// the reference length, and every series of a different length is dropped
// (and counted as a mismatch).
//
// ifileName: by-columns input file
// posID:     position (starting at 1) of the ID field in a line
// posValue:  position (starting at 1) of the value field in a line
// ofileName: output file, one series per row
// nbItems:   maximum number of series to write; <= 0 means "all"
// sep:       field separator, used for both input and output
// Returns 0 on success, 1 if a file cannot be opened/written or on memory
// exhaustion.
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file '%s'\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file '%s'\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (curChar != EOF && curChar != '\n' && curChar != '\r');

	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount = 0, mismatchLengthCount = 0;
	int status = 0;
	int seriesStarted = 0; //becomes 1 as soon as one data line has been read
	int tsLength = 0; //number of lines seen for the current series
	int refTsLength = -1; //unknown until the first series is complete
	int lastID = 0, ID = 0;
	float value = 0.f;
	float* tsBuffer = NULL; //values of the current series
	size_t capacity = 0; //number of floats allocated in tsBuffer
	while (1)
	{
		// Go to next non-empty line. The character is read afresh at every
		// iteration, so exactly the character just read is pushed back.
		do
			curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r');
		if (curChar == EOF)
			break;
		ungetc(curChar, ifile);

		// Read current line
		scan_line(ifile, sep, posID, &ID, posValue, &value);

		if (seriesStarted && ID != lastID)
		{
			// Just starting a new time-series: must process the last one.
			// The first completed series is considered clean and fixes the
			// reference length.
			if (refTsLength < 0)
				refTsLength = tsLength;
			if (tsLength == refTsLength)
			{
				write_series(ofile, tsBuffer, tsLength, sep);
				seriesCount++;
				if (nbItems > 0 && seriesCount >= (uint32_t) nbItems)
					break;
			}
			// Mismatch lengths: skip series
			else
				mismatchLengthCount++;
			tsLength = 0;
		}
		seriesStarted = 1;
		lastID = ID;

		// Store the value: unbounded while the first series is being read,
		// never more than refTsLength values afterwards.
		if (refTsLength < 0 || tsLength < refTsLength)
		{
			if ((size_t) tsLength >= capacity)
			{
				size_t newCapacity = (capacity > 0 ? 2 * capacity : 1024);
				float* grown = realloc(tsBuffer, newCapacity * sizeof *grown);
				if (grown == NULL)
				{
					fprintf(stderr, "ERROR: out of memory\n");
					status = 1;
					break;
				}
				tsBuffer = grown;
				capacity = newCapacity;
			}
			tsBuffer[tsLength] = value;
		}
		// Count every line, stored or not: a series longer than the reference
		// must be detected as a mismatch, not silently truncated.
		tsLength++;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
	}

	if (status == 0 && seriesStarted && (nbItems <= 0 || seriesCount < (uint32_t) nbItems))
	{
		// flush last time-series if all conditions are met
		if (refTsLength < 0)
			refTsLength = tsLength;
		if (tsLength == refTsLength)
		{
			write_series(ofile, tsBuffer, tsLength, sep);
			seriesCount++;
		}
		else
			mismatchLengthCount++;
	}

	// finally print some statistics
	fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
	if (mismatchLengthCount > 0)
		fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);

	free(tsBuffer);
	fclose(ifile);
	// fclose() flushes the output: a failure here means lost data
	if (fclose(ofile) != 0)
	{
		fprintf(stderr, "ERROR: cannot write output file '%s'\n", ofileName);
		status = 1;
	}
	return status;
}
86223e27 | 186 | |
// Command-line entry point: see the usage message below for the arguments.
// Returns 0 after printing the usage, 1 on invalid positions, and the result
// of transform() otherwise.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		// Adjacent string literals rather than backslash line continuations:
		// the latter embed the indentation of the source file in the message.
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			"  - ifileName: name of by-columns CSV input file\n"
			"  - posID: position of the identifier in a line (start at 1)\n"
			"  - posValue: position of the value of interest in a line\n"
			"  - ofileName: name of the output file; default: out.csv\n"
			"  - nbItems: number of series to retrieve; default: 0 (all)\n"
			"  - sep: fields separator; default: ','\n");
		return 0;
	}

	// atoi() yields 0 on non-numeric input, which the check below rejects
	// since positions start at 1. Both fields cannot share a position either:
	// the value would then never be read.
	int posID = atoi(argv[2]);
	int posValue = atoi(argv[3]);
	if (posID < 1 || posValue < 1 || posID == posValue)
	{
		fprintf(stderr, "ERROR: posID and posValue must be distinct positions >= 1\n");
		return 1;
	}

	const char* ofileName = (argc > 4 ? argv[4] : "out.csv");
	int nbItems = (argc > 5 ? atoi(argv[5]) : 0);
	// An empty separator argument falls back to the default
	char sep = (argc > 6 && argv[6][0] != '\0' ? argv[6][0] : ',');
	return transform(argv[1], posID, posValue, ofileName, nbItems, sep);
}