Commit | Line | Data |
---|---|---|
c133b1bd BA |
1 | #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers |
2 | #include <inttypes.h> | |
3 | #include <cgds/Vector.h> | |
4 | #include <string.h> | |
5 | #include <math.h> | |
6 | #include <float.h> | |
22037304 | 7 | #include <stdio.h> |
c133b1bd | 8 | |
a0fa5bd0 BA |
// Read an integer char by char, and position the cursor to next character.
//
// Every character up to the first decimal digit is skipped, without going
// past the end of the current line or the end of the stream; then digits are
// accumulated in base 10. The result is negated only when the very first
// character read is '-'.
//
// stream:  input stream, consumed with fgetc()
// integer: output; set to the parsed value, or to 0 when no digit was found
//
// Returns the first character following the digits: separator, endline or
// .,e,E (if inside readReal). Returns '\n' when the line ends before any digit
// was met, and (char)EOF when the stream is exhausted or fails; callers should
// use feof()/ferror() to tell that last case apart.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so that EOF stays distinguishable
	// from any valid character (a plain char cannot represent it reliably).
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	// Stop on EOF (otherwise this loop never ends) and on '\n' (otherwise an
	// empty last field would swallow the beginning of the next line).
	while (curChar != EOF && curChar != '\n' && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar;
}
25 | ||
// Read a real number char by char, and position the cursor to next character.
//
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]. The integer part and the
// exponent are parsed by readInt(); fractional digits are accumulated here.
// At least one digit is expected before the decimal point, because readInt()
// skips non-digit characters while looking for the integer part.
//
// stream: input stream, positioned on the first character of the number
// real:   output; receives the parsed value
//
// Returns the first character following the number: separator or endline.
char readReal(FILE* stream, float* real)
{
	// Peek at the first character: readInt() cannot report the sign of "-0",
	// so the sign is remembered here. Deriving it from the integer part alone
	// gives the wrong sign to values whose integer part is 0 (e.g. "0.5").
	int firstChar = fgetc(stream);
	int negative = (firstChar == '-');
	if (firstChar != EOF)
		ungetc(firstChar, stream);

	int integerPart, exponent = 0;
	char curChar = readInt(stream, &integerPart);
	double magnitude = (integerPart < 0 ? -(double)integerPart : (double)integerPart);
	if (curChar == '.')
	{
		// Accumulate fractional digits directly in floating point: any number
		// of digits (including leading zeros) is handled without overflowing
		// an integer; digits beyond the available precision simply vanish.
		double weight = 0.1;
		int digit;
		while ((digit = fgetc(stream)) >= '0' && digit <= '9')
		{
			magnitude += (digit - '0') * weight;
			weight /= 10.;
		}
		curChar = (char) digit;
	}
	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);
	*real = (float) ((negative ? -magnitude : magnitude) * pow(10, exponent));
	return curChar; //separator or endline
}
49 | ||
a0fa5bd0 BA |
// Parse a line into integer+float (ID, value).
//
// Fields are numbered from 1 and delimited by sep; the whole line is consumed,
// including its trailing '\n' if there is one.
//
// ifile:    input stream, positioned at the beginning of a line
// sep:      fields separator
// posID:    position of the field parsed with readInt() into *ID
// posValue: position of the field parsed with readReal() into *value
// ID:       output; left untouched if the line has no field at posID
// value:    output; left untouched if the line has no field at posValue
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	char curChar;
	int position = 1;
	while (1)
	{
		if (position == posID)
			curChar = readInt(ifile, ID);
		else if (position == posValue)
			curChar = readReal(ifile, value);
		else
			curChar = fgetc(ifile); //erase the comma (and skip field then)

		// Continue until next separator (or line end, file end or read error).
		// ferror() has to be tested too: after a failed read fgetc() keeps
		// returning EOF while feof() stays false, which would loop forever.
		int streamDone = (feof(ifile) || ferror(ifile));
		while (!streamDone && curChar != '\n' && curChar != sep)
		{
			curChar = fgetc(ifile);
			streamDone = (feof(ifile) || ferror(ifile));
		}
		position++;

		if (curChar == '\n' || streamDone)
		{
			// Reached end of line (or nothing more can be read)
			break;
		}
	}
}
77 | ||
86223e27 | 78 | // Main job: parse a data file into a conventional CSV file in rows, without header |
a0fa5bd0 BA |
79 | // Current limitations: |
80 | // - remove partial series (we could fill missing values instead) | |
a2fd2d76 BA |
81 | // - consider missing fields == 0 (if ,,) |
82 | // - IDs should be st. pos. integers | |
83 | // - UNIX linebreaks only (\n) | |
a0fa5bd0 BA |
84 | int transform(const char* ifileName, int posID, int posValue, |
85 | const char* ofileName, int nbItems, char sep) | |
c133b1bd | 86 | { |
a2fd2d76 BA |
87 | uint64_t processedLines = 0; //execution trace |
88 | uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; | |
89 | int tsLength, lastID=0, ID, firstID, eof; | |
90 | float value, tmpVal; | |
91 | Vector* tsBuffer = vector_new(float); | |
86223e27 | 92 | FILE* ifile = fopen(ifileName, "r"); |
a0fa5bd0 | 93 | // Output file to write time-series sequentially, CSV format. |
c133b1bd BA |
94 | FILE* ofile = fopen(ofileName, "w"); |
95 | ||
86223e27 | 96 | // Skip header |
a0fa5bd0 | 97 | char curChar; |
86223e27 | 98 | do |
a0fa5bd0 | 99 | curChar = fgetc(ifile); |
a2fd2d76 | 100 | while (curChar != '\n'); |
86223e27 | 101 | |
a0fa5bd0 | 102 | // Process one client (ID in first column) at a time |
a2fd2d76 | 103 | while (1) |
c133b1bd | 104 | { |
c133b1bd | 105 | |
a2fd2d76 BA |
106 | eof = feof(ifile); |
107 | if (!eof) | |
108 | { | |
109 | // Is there anything left to read? (file may end with '\n') | |
110 | curChar = fgetc(ifile); | |
111 | if (!feof(ifile) && curChar != '\n') | |
112 | { | |
113 | // Yes: read current line | |
114 | ungetc(curChar, ifile); | |
115 | scan_line(ifile, sep, posID, &ID, posValue, &value); | |
116 | } | |
117 | else | |
118 | eof = 1; | |
119 | } | |
22037304 | 120 | |
a2fd2d76 | 121 | if (ID != lastID || eof) |
c133b1bd | 122 | { |
c133b1bd BA |
123 | if (lastID > 0) |
124 | { | |
a2fd2d76 BA |
125 | // Just starting a new time-series (or EOF): process the last one |
126 | if (tsLength == vector_size(tsBuffer)) | |
c133b1bd | 127 | { |
a2fd2d76 | 128 | for (int i=0; i<tsLength-1; i++) |
c133b1bd | 129 | { |
a2fd2d76 BA |
130 | vector_get(tsBuffer, i, tmpVal); |
131 | fprintf(ofile, "%g%c", tmpVal, sep); | |
c133b1bd | 132 | } |
a2fd2d76 BA |
133 | vector_get(tsBuffer, tsLength-1, tmpVal); |
134 | fprintf(ofile, "%g\n", tmpVal); | |
135 | seriesCount++; | |
c133b1bd BA |
136 | if (nbItems > 0 && ++seriesCount >= nbItems) |
137 | break; | |
138 | } | |
c133b1bd | 139 | else |
a2fd2d76 BA |
140 | { |
141 | // Mismatch lengths: skip series | |
a0fa5bd0 | 142 | mismatchLengthCount++; |
a2fd2d76 | 143 | } |
c133b1bd | 144 | } |
22037304 | 145 | else |
a2fd2d76 BA |
146 | firstID = ID; |
147 | if (eof) | |
148 | { | |
149 | // Last serie is processed | |
150 | break; | |
151 | } | |
152 | // Reinitialize current index of new serie | |
c133b1bd BA |
153 | tsLength = 0; |
154 | lastID = ID; | |
155 | } | |
c133b1bd | 156 | |
a2fd2d76 BA |
157 | // Fill values buffer |
158 | if (ID != firstID) | |
c133b1bd | 159 | { |
a2fd2d76 BA |
160 | if (tsLength < vector_size(tsBuffer)) |
161 | vector_set(tsBuffer, tsLength, value); | |
c133b1bd | 162 | } |
a0fa5bd0 | 163 | else |
a2fd2d76 BA |
164 | { |
165 | // First serie is reference: push all values | |
166 | vector_push(tsBuffer, value); | |
167 | } | |
168 | tsLength++; | |
169 | ||
170 | if ((++processedLines) % 1000000 == 0) | |
171 | fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); | |
c133b1bd BA |
172 | } |
173 | ||
174 | // finally print some statistics | |
a0fa5bd0 BA |
175 | fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount); |
176 | if (mismatchLengthCount > 0) | |
177 | fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount); | |
c133b1bd BA |
178 | |
179 | fclose(ifile); | |
180 | fclose(ofile); | |
a0fa5bd0 | 181 | return 0; |
c133b1bd | 182 | } |
86223e27 | 183 | |
// Command-line entry point: check arguments, then delegate to transform().
// Prints the usage text and returns 0 when fewer than 3 arguments are given;
// returns 1 when the field positions are invalid; otherwise returns whatever
// transform() returns.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n "
			"- ifileName: name of by-columns CSV input file\n "
			"- posID: position of the identifier in a line (start at 1)\n "
			"- posValue: position of the value of interest in a line\n "
			"- ofileName: name of the output file; default: out.csv\n "
			"- nbItems: number of series to retrieve; default: 0 (all)\n "
			"- sep: fields separator; default: ','\n");
		return 0;
	}

	// Positions start at 1 and must differ: a position which is never reached
	// (0, negative, or unparsable since atoi() then yields 0) or shared by
	// both fields would leave the ID or the value unread on every line.
	int posID = atoi(argv[2]);
	int posValue = atoi(argv[3]);
	if (posID < 1 || posValue < 1 || posID == posValue)
	{
		fprintf(stderr, "ERROR: posID and posValue must be distinct integers >= 1\n");
		return 1;
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv",
		argc > 5 ? atoi(argv[5]) : 0,
		argc > 6 ? argv[6][0] : ',');
}