Commit | Line | Data |
---|---|---|
b7cd987d BA |
1 | #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers |
2 | #include <inttypes.h> | |
3 | #include <cgds/Vector.h> | |
4 | #include <string.h> | |
5 | #include <math.h> | |
6 | #include <float.h> | |
7 | #include <stdio.h> | |
8 | ||
// Read an integer char by char, and position the cursor to next character.
//  - stream: input stream to read from
//  - integer: output; receives the parsed value (0 if no digit was found)
// Leading non-digit characters are skipped. The value is negated only when the
// very first character read is '-'.
// Returns the first character following the digits: separator, endline or
// .,e,E (if inside readReal). When the end of the stream is reached, the
// value accumulated so far is stored and (char) EOF is returned.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so that EOF stays distinguishable
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	// Skip up to the first digit; give up at EOF instead of looping forever
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar;
}
25 | ||
// Read a real number char by char, and position the cursor to next character.
//  - stream: input stream to read from
//  - real: output; receives the parsed value
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]. The integer part and the
// exponent are parsed by readInt(); the fractional digits are read here.
// Returns the first character following the number (separator or endline).
char readReal(FILE* stream, float* real)
{
	// Peek at the first character: readInt() yields 0 for "-0", so the sign of
	// numbers like -0.5 would otherwise be lost.
	int firstChar = fgetc(stream);
	if (firstChar != EOF)
		ungetc(firstChar, stream);
	int negative = (firstChar == '-');

	int integerPart, exponent = 0;
	char curChar = readInt(stream, &integerPart);

	// Fractional digits are accumulated in a double (fraction / divisor):
	// leading zeros are handled naturally and long fractions cannot overflow
	// an int.
	double fraction = 0.0, divisor = 1.0;
	if (curChar == '.')
	{
		int digit, nbDigits = 0;
		while ((digit = fgetc(stream)) >= '0' && digit <= '9')
		{
			// Digits beyond the 18th cannot influence a float: consume them only
			if (nbDigits < 18)
			{
				fraction = 10.0 * fraction + (digit - '0');
				divisor *= 10.0;
				nbDigits++;
			}
		}
		curChar = (char) digit;
	}
	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	double magnitude = fabs((double) integerPart) + fraction / divisor;
	*real = (float) ((negative ? -magnitude : magnitude) * pow(10, exponent));

	return curChar; //separator or endline
}
50 | ||
// Parse one line into an integer + a float (ID, value).
// Fields are numbered from 1. The field at posID is parsed with readInt() into
// *ID, the field at posValue with readReal() into *value (posID takes
// precedence if both positions are equal); every other field is skipped.
// Stops after the end of the line or at the end of the file.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	for (int field = 1; ; field++)
	{
		char lastRead;
		if (field == posID)
			lastRead = readInt(ifile, ID);
		else if (field == posValue)
			lastRead = readReal(ifile, value);
		else
		{
			// Uninteresting field: start consuming it
			lastRead = fgetc(ifile);
		}

		// Consume whatever remains of the field, up to the separator
		// (or the line end, or the file end)
		while (!feof(ifile) && lastRead != '\n' && lastRead != sep)
			lastRead = fgetc(ifile);

		// End of line or end of file: nothing more to parse
		if (lastRead == '\n' || feof(ifile))
			return;
	}
}
78 | ||
79 | // Main job: parse a data file into a conventional CSV file in rows, without header | |
80 | // Current limitations: | |
81 | // - remove partial series (we could fill missing values instead) | |
82 | // - consider missing fields == 0 (if ,,) | |
83 | // - IDs should be st. pos. integers | |
84 | // - UNIX linebreaks only (\n) | |
85 | int transform(const char* ifileName, int posID, int posValue, | |
86 | const char* ofileName, int nbItems, char sep) | |
87 | { | |
88 | uint64_t processedLines = 0; //execution trace | |
89 | uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; | |
90 | int tsLength, lastID=0, ID, firstID, eof; | |
91 | float value, tmpVal; | |
92 | Vector* tsBuffer = vector_new(float); | |
93 | FILE* ifile = fopen(ifileName, "r"); | |
94 | // Output file to write time-series sequentially, CSV format. | |
95 | FILE* ofile = fopen(ofileName, "w"); | |
96 | ||
97 | // Skip header | |
98 | char curChar; | |
99 | do | |
100 | curChar = fgetc(ifile); | |
101 | while (curChar != '\n'); | |
102 | ||
103 | // Process one client (ID in first column) at a time | |
104 | while (1) | |
105 | { | |
106 | ||
107 | eof = feof(ifile); | |
108 | if (!eof) | |
109 | { | |
110 | // Is there anything left to read? (file may end with '\n') | |
111 | curChar = fgetc(ifile); | |
112 | if (!feof(ifile) && curChar != '\n') | |
113 | { | |
114 | // Yes: read current line | |
115 | ungetc(curChar, ifile); | |
116 | scan_line(ifile, sep, posID, &ID, posValue, &value); | |
117 | } | |
118 | else | |
119 | eof = 1; | |
120 | } | |
121 | ||
122 | if (ID != lastID || eof) | |
123 | { | |
124 | if (lastID > 0) | |
125 | { | |
126 | // Just starting a new time-series (or EOF): process the last one | |
127 | if (tsLength == vector_size(tsBuffer)) | |
128 | { | |
129 | for (int i=0; i<tsLength-1; i++) | |
130 | { | |
131 | vector_get(tsBuffer, i, tmpVal); | |
132 | fprintf(ofile, "%g%c", tmpVal, sep); | |
133 | } | |
134 | vector_get(tsBuffer, tsLength-1, tmpVal); | |
135 | fprintf(ofile, "%g\n", tmpVal); | |
136 | seriesCount++; | |
137 | if (nbItems > 0 && ++seriesCount >= nbItems) | |
138 | break; | |
139 | } | |
140 | else | |
141 | { | |
142 | // Mismatch lengths: skip series | |
143 | mismatchLengthCount++; | |
144 | } | |
145 | } | |
146 | else | |
147 | firstID = ID; | |
148 | if (eof) | |
149 | { | |
150 | // Last serie is processed | |
151 | break; | |
152 | } | |
153 | // Reinitialize current index of new serie | |
154 | tsLength = 0; | |
155 | lastID = ID; | |
156 | } | |
157 | ||
158 | // Fill values buffer | |
159 | if (ID != firstID) | |
160 | { | |
161 | if (tsLength < vector_size(tsBuffer)) | |
162 | vector_set(tsBuffer, tsLength, value); | |
163 | } | |
164 | else | |
165 | { | |
166 | // First serie is reference: push all values | |
167 | vector_push(tsBuffer, value); | |
168 | } | |
169 | tsLength++; | |
170 | ||
171 | if ((++processedLines) % 1000000 == 0) | |
172 | fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); | |
173 | } | |
174 | ||
175 | // finally print some statistics | |
176 | fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount); | |
177 | if (mismatchLengthCount > 0) | |
178 | fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount); | |
179 | ||
180 | fclose(ifile); | |
181 | fclose(ofile); | |
182 | return 0; | |
183 | } | |
184 | ||
// Entry point: transform ifileName posID posValue [ofileName [nbItems [sep]]]
// Prints the usage text and returns 0 when fewer than 3 arguments are given,
// returns 1 when the field positions are invalid, and otherwise returns the
// result of transform().
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			" - ifileName: name of by-columns CSV input file\n"
			" - posID: position of the identifier in a line (start at 1)\n"
			" - posValue: position of the value of interest in a line\n"
			" - ofileName: name of the output file; default: out.csv\n"
			" - nbItems: number of series to retrieve; default: 0 (all)\n"
			" - sep: fields separator; default: ','\n");
		return 0;
	}

	// Positions start at 1 and must differ; otherwise the ID or the value
	// would never be read from the input lines. atoi() yields 0 on
	// non-numeric input, which is rejected here as well.
	int posID = atoi(argv[2]), posValue = atoi(argv[3]);
	if (posID < 1 || posValue < 1 || posID == posValue)
	{
		fprintf(stderr,
			"ERROR: posID and posValue must be distinct integers >= 1\n");
		return 1;
	}

	// An empty separator argument falls back to the default
	char sep = (argc > 6 && argv[6][0] != '\0') ? argv[6][0] : ',';

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv",
		argc > 5 ? atoi(argv[5]) : 0,
		sep);
}