First commit
[epclust.git] / data / preprocessing / convert_to_CSV.c
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <errno.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
8
// Read an integer char by char, and position the cursor to next character.
//
// Every character found before the first digit is skipped. A minus sign is
// only honoured when it is the very first character read.
//
//  stream:  input stream, read with fgetc()
//  integer: receives the parsed value; 0 when no digit is found before EOF,
//           clamped to +/-INT_MAX when the digits do not fit in an int
//
// Returns the first character following the digits: separator, endline or
// .,e,E (if inside readReal). At end of file the returned value is
// (char) EOF, so callers should test feof(stream).
char readInt(FILE* stream, int* integer)
{
    *integer = 0;
    // fgetc() returns an int so that EOF stays distinguishable from any char
    int curChar = fgetc(stream);
    int sign = (curChar == '-' ? -1 : 1);

    // Skip up to the first digit; stop at EOF instead of looping forever
    while (curChar != EOF && (curChar < '0' || curChar > '9'))
        curChar = fgetc(stream);

    while (curChar >= '0' && curChar <= '9')
    {
        int digit = curChar - '0';
        // Saturate rather than overflow (signed overflow is undefined behavior)
        if (*integer > (INT_MAX - digit) / 10)
            *integer = INT_MAX;
        else
            *integer = 10 * (*integer) + digit;
        curChar = fgetc(stream);
    }
    (*integer) *= sign;
    return (char) curChar;
}
25
// Read a real number char by char, and position the cursor to next character.
//
// Accepted shape: [-]digits[.digits][(e|E)[+|-]digits]
// Every character found before the first digit is skipped. A minus sign is
// only honoured when it is the very first character read.
//
//  stream: input stream, read with fgetc()
//  real:   receives the parsed value (0 when no digit is found before EOF)
//
// Returns the first character following the number: separator or endline.
// At end of file the returned value is (char) EOF, so callers should test
// feof(stream).
char readReal(FILE* stream, float* real)
{
    // fgetc() returns an int so that EOF stays distinguishable from any char
    int curChar = fgetc(stream);
    // Keep the sign apart from the digits: "-0.5" has a zero integer part,
    // so the sign cannot be recovered from it afterwards
    int negative = (curChar == '-');

    // Skip up to the first digit; stop at EOF instead of looping forever
    while (curChar != EOF && (curChar < '0' || curChar > '9'))
        curChar = fgetc(stream);

    // Accumulate in double: no integer overflow, however many digits come
    double mantissa = 0.0;
    while (curChar >= '0' && curChar <= '9')
    {
        mantissa = 10.0 * mantissa + (curChar - '0');
        curChar = fgetc(stream);
    }

    if (curChar == '.')
    {
        double fraction = 0.0, scale = 1.0;
        while ((curChar = fgetc(stream)) >= '0' && curChar <= '9')
        {
            // Digits beyond the 18th cannot change a float: consume them only
            if (scale < 1e18)
            {
                fraction = 10.0 * fraction + (curChar - '0');
                scale *= 10.0;
            }
        }
        mantissa += fraction / scale;
    }

    int exponent = 0, negativeExponent = 0;
    if (curChar == 'e' || curChar == 'E')
    {
        curChar = fgetc(stream);
        negativeExponent = (curChar == '-');
        if (curChar == '-' || curChar == '+')
            curChar = fgetc(stream);
        while (curChar >= '0' && curChar <= '9')
        {
            // Such magnitudes already overflow/underflow: stop accumulating
            if (exponent < 1000)
                exponent = 10 * exponent + (curChar - '0');
            curChar = fgetc(stream);
        }
    }

    double result = 0.0;
    if (mantissa != 0.0) //also avoids 0 * infinity
    {
        double factor = 1.0;
        for (int i = 0; i < exponent; i++)
            factor *= 10.0;
        result = (negativeExponent ? mantissa / factor : mantissa * factor);
    }
    *real = (float) (negative ? -result : result);

    return (char) curChar; //separator or endline
}
50
// Parse a line into integer+float (ID, value)
//
//  ifile:    input stream, positioned at the beginning of a line
//  sep:      fields separator
//  posID:    position (starting at 1) of the field holding the identifier
//  ID:       receives the identifier
//  posValue: position (starting at 1) of the field holding the value
//  value:    receives the value
//
// All other fields are skipped. An empty ID or value field yields 0.
// On return the whole line has been consumed, final '\n' included.
static void scan_line(FILE* ifile, char sep,
    int posID, int* ID, int posValue, float* value)
{
    int position = 1;
    while (1)
    {
        // Look at the first character of the field before parsing it:
        // the number readers skip any non-digit, so on an empty field they
        // would run through the separator into the next field
        int first = fgetc(ifile);
        char curChar = (char) first;
        int emptyField = (first == EOF || curChar == '\n' || curChar == sep);

        if (position == posID)
        {
            if (emptyField)
                *ID = 0;
            else
            {
                ungetc(first, ifile);
                curChar = readInt(ifile, ID);
            }
        }
        else if (position == posValue)
        {
            if (emptyField)
                *value = 0.0f;
            else
            {
                ungetc(first, ifile);
                curChar = readReal(ifile, value);
            }
        }
        // else: uninteresting field, its first character is already consumed

        // Continue until next separator (or line end or file end)
        while (!feof(ifile) && curChar != '\n' && curChar != sep)
            curChar = fgetc(ifile);
        position++;

        if (curChar == '\n' || feof(ifile))
        {
            // Reached end of line
            break;
        }
    }
}
78
79 // Main job: parse a data file into a conventional CSV file in rows, without header
80 // Current limitations:
81 // - remove partial series (we could fill missing values instead)
82 // - consider missing fields == 0 (if ,,)
83 // - IDs should be st. pos. integers
84 // - UNIX linebreaks only (\n)
85 int transform(const char* ifileName, int posID, int posValue,
86 const char* ofileName, int nbItems, char sep)
87 {
88 uint64_t processedLines = 0; //execution trace
89 uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
90 int tsLength, lastID=0, ID, firstID, eof;
91 float value, tmpVal;
92 Vector* tsBuffer = vector_new(float);
93 FILE* ifile = fopen(ifileName, "r");
94 // Output file to write time-series sequentially, CSV format.
95 FILE* ofile = fopen(ofileName, "w");
96
97 // Skip header
98 char curChar;
99 do
100 curChar = fgetc(ifile);
101 while (curChar != '\n');
102
103 // Process one client (ID in first column) at a time
104 while (1)
105 {
106
107 eof = feof(ifile);
108 if (!eof)
109 {
110 // Is there anything left to read? (file may end with '\n')
111 curChar = fgetc(ifile);
112 if (!feof(ifile) && curChar != '\n')
113 {
114 // Yes: read current line
115 ungetc(curChar, ifile);
116 scan_line(ifile, sep, posID, &ID, posValue, &value);
117 }
118 else
119 eof = 1;
120 }
121
122 if (ID != lastID || eof)
123 {
124 if (lastID > 0)
125 {
126 // Just starting a new time-series (or EOF): process the last one
127 if (tsLength == vector_size(tsBuffer))
128 {
129 for (int i=0; i<tsLength-1; i++)
130 {
131 vector_get(tsBuffer, i, tmpVal);
132 fprintf(ofile, "%g%c", tmpVal, sep);
133 }
134 vector_get(tsBuffer, tsLength-1, tmpVal);
135 fprintf(ofile, "%g\n", tmpVal);
136 seriesCount++;
137 if (nbItems > 0 && ++seriesCount >= nbItems)
138 break;
139 }
140 else
141 {
142 // Mismatch lengths: skip series
143 mismatchLengthCount++;
144 }
145 }
146 else
147 firstID = ID;
148 if (eof)
149 {
150 // Last serie is processed
151 break;
152 }
153 // Reinitialize current index of new serie
154 tsLength = 0;
155 lastID = ID;
156 }
157
158 // Fill values buffer
159 if (ID != firstID)
160 {
161 if (tsLength < vector_size(tsBuffer))
162 vector_set(tsBuffer, tsLength, value);
163 }
164 else
165 {
166 // First serie is reference: push all values
167 vector_push(tsBuffer, value);
168 }
169 tsLength++;
170
171 if ((++processedLines) % 1000000 == 0)
172 fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
173 }
174
175 // finally print some statistics
176 fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
177 if (mismatchLengthCount > 0)
178 fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
179
180 fclose(ifile);
181 fclose(ofile);
182 return 0;
183 }
184
// Convert a whole command-line argument into an int.
// Returns 1 on success, 0 if text is not a decimal integer fitting in an int.
static int parse_int_arg(const char* text, int* result)
{
    char* end;
    errno = 0;
    long parsed = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE
        || parsed < INT_MIN || parsed > INT_MAX)
    {
        return 0;
    }
    *result = (int) parsed;
    return 1;
}

// Entry point: check the command line, then run transform().
// Returns 0 after printing the usage or on success, 1 on invalid arguments
// or if transform() fails.
int main(int argc, char** argv)
{
    if (argc < 4) //program name + 3 arguments
    {
        printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n "
            "- ifileName: name of by-columns CSV input file\n "
            "- posID: position of the identifier in a line (start at 1)\n "
            "- posValue: position of the value of interest in a line\n "
            "- ofileName: name of the output file; default: out.csv\n "
            "- nbItems: number of series to retrieve; default: 0 (all)\n "
            "- sep: fields separator; default: ','\n");
        return 0;
    }

    // Positions must be valid and distinct, otherwise the line parser
    // never fills the ID and/or the value
    int posID, posValue, nbItems = 0;
    if (!parse_int_arg(argv[2], &posID) || posID < 1
        || !parse_int_arg(argv[3], &posValue) || posValue < 1
        || posID == posValue)
    {
        fprintf(stderr,
            "ERROR: posID and posValue must be distinct integers >= 1\n");
        return 1;
    }
    if (argc > 5 && !parse_int_arg(argv[5], &nbItems))
    {
        fprintf(stderr, "ERROR: nbItems must be an integer\n");
        return 1;
    }

    return transform(argv[1], posID, posValue,
        argc > 4 ? argv[4] : "out.csv",
        nbItems,
        argc > 6 ? argv[6][0] : ',');
}