646c4407380ccf396971fc8d89bd96ace8537899
[epclust.git] / data / preprocessing / convert_to_CSV.c
1 #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
2 #include <inttypes.h>
3 #include <cgds/Vector.h>
4 #include <string.h>
5 #include <math.h>
6 #include <float.h>
7 #include <stdio.h>
8
// Read an integer char by char, and position the cursor to next character.
//
// A '-' is honoured only when it is the very first character read; any other
// leading non-digit characters are skipped. Reading stops at the first
// non-digit that follows the digits, or at end of file.
//
//   stream:  input stream, consumed with fgetc()
//   integer: output; receives the parsed value (0 when no digit was found)
//
// Returns the character that ended the number: separator, endline or .,e,E
// (if inside readReal). At end of file the returned value is EOF converted to
// char, so callers should test feof(stream) rather than rely on this char.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so that EOF stays distinguishable
	// from every valid character, whatever the signedness of char.
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	// Skip everything up to the first digit, but give up at end of file:
	// fgetc() keeps returning EOF there, which would make this loop endless.
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar; //separator, endline or .,e,E (if inside readReal)
}
25
// Read a real number char by char, and position the cursor to next character.
//
// Recognized shape: integer part (read by readInt), optionally followed by
// '.' and fractional digits, optionally followed by 'e' or 'E' and an integer
// exponent (also read by readInt).
//
//   stream: input stream
//   real:   output; receives the parsed value
//
// Returns the character that ended the number (separator or endline). At end
// of file the returned char is not meaningful: test feof(stream) instead.
char readReal(FILE* stream, float* real)
{
	// Peek at the first character to learn the sign. readInt() only hands back
	// the integer part, which is 0 for both "0.5" and "-0.5", so the sign
	// cannot be recovered from it afterwards.
	int firstChar = fgetc(stream);
	if (firstChar != EOF)
		ungetc(firstChar, stream);
	int negative = (firstChar == '-');

	int integerPart = 0, exponent = 0;
	char curChar = readInt(stream, &integerPart);
	double magnitude = fabs((double) integerPart);

	if (curChar == '.')
	{
		// Accumulate the fractional digits in a double: an int would overflow
		// on long mantissas, and leading zeros need no special treatment.
		double fraction = 0., scale = 1.;
		int digit = fgetc(stream);
		while (digit >= '0' && digit <= '9')
		{
			fraction = 10. * fraction + (digit - '0');
			scale *= 10.;
			digit = fgetc(stream);
		}
		magnitude += fraction / scale;
		curChar = (char) digit;
	}
	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	*real = (float) ((negative ? -magnitude : magnitude) * pow(10, exponent));
	return curChar; //separator or endline
}
49
// Parse one line into an integer and a float (ID, value).
//
// Fields are numbered from 1 and delimited by 'sep'. The field at position
// posID is read with readInt() into *ID, the field at position posValue is
// read with readReal() into *value; every other field is skipped. The stream
// is left just after the end of the line (or at end of file).
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	for (int field = 1; ; field++)
	{
		char last;
		if (field == posID)
			last = readInt(ifile, ID);
		else if (field == posValue)
			last = readReal(ifile, value);
		else
			last = fgetc(ifile); //consume one char; the rest is skipped below

		// Move forward to the next separator (or line end or file end)
		while (!(feof(ifile) || last == '\n' || last == sep))
			last = fgetc(ifile);

		// Line end or file end: nothing more to parse
		if (last == '\n' || feof(ifile))
			return;
	}
}
77
78 // Main job: parse a data file into a conventional CSV file in rows, without header
79 // Current limitations:
80 // - remove partial series (we could fill missing values instead)
81 // - consider missing fields == 0 (if ,,)
82 // - IDs should be st. pos. integers
83 // - UNIX linebreaks only (\n)
84 int transform(const char* ifileName, int posID, int posValue,
85 const char* ofileName, int nbItems, char sep)
86 {
87 uint64_t processedLines = 0; //execution trace
88 uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
89 int tsLength, lastID=0, ID, firstID, eof;
90 float value, tmpVal;
91 Vector* tsBuffer = vector_new(float);
92 FILE* ifile = fopen(ifileName, "r");
93 // Output file to write time-series sequentially, CSV format.
94 FILE* ofile = fopen(ofileName, "w");
95
96 // Skip header
97 char curChar;
98 do
99 curChar = fgetc(ifile);
100 while (curChar != '\n');
101
102 // Process one client (ID in first column) at a time
103 while (1)
104 {
105
106 eof = feof(ifile);
107 if (!eof)
108 {
109 // Is there anything left to read? (file may end with '\n')
110 curChar = fgetc(ifile);
111 if (!feof(ifile) && curChar != '\n')
112 {
113 // Yes: read current line
114 ungetc(curChar, ifile);
115 scan_line(ifile, sep, posID, &ID, posValue, &value);
116 }
117 else
118 eof = 1;
119 }
120
121 if (ID != lastID || eof)
122 {
123 if (lastID > 0)
124 {
125 // Just starting a new time-series (or EOF): process the last one
126 if (tsLength == vector_size(tsBuffer))
127 {
128 for (int i=0; i<tsLength-1; i++)
129 {
130 vector_get(tsBuffer, i, tmpVal);
131 fprintf(ofile, "%g%c", tmpVal, sep);
132 }
133 vector_get(tsBuffer, tsLength-1, tmpVal);
134 fprintf(ofile, "%g\n", tmpVal);
135 seriesCount++;
136 if (nbItems > 0 && ++seriesCount >= nbItems)
137 break;
138 }
139 else
140 {
141 // Mismatch lengths: skip series
142 mismatchLengthCount++;
143 }
144 }
145 else
146 firstID = ID;
147 if (eof)
148 {
149 // Last serie is processed
150 break;
151 }
152 // Reinitialize current index of new serie
153 tsLength = 0;
154 lastID = ID;
155 }
156
157 // Fill values buffer
158 if (ID != firstID)
159 {
160 if (tsLength < vector_size(tsBuffer))
161 vector_set(tsBuffer, tsLength, value);
162 }
163 else
164 {
165 // First serie is reference: push all values
166 vector_push(tsBuffer, value);
167 }
168 tsLength++;
169
170 if ((++processedLines) % 1000000 == 0)
171 fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
172 }
173
174 // finally print some statistics
175 fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
176 if (mismatchLengthCount > 0)
177 fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
178
179 fclose(ifile);
180 fclose(ofile);
181 return 0;
182 }
183
// Entry point: check the command line and run transform().
// Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]
// Returns 0 after printing the usage text (fewer than 3 arguments), 1 on
// invalid arguments, and the result of transform() otherwise.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n "
			"- ifileName: name of by-columns CSV input file\n "
			"- posID: position of the identifier in a line (start at 1)\n "
			"- posValue: position of the value of interest in a line\n "
			"- ofileName: name of the output file; default: out.csv\n "
			"- nbItems: number of series to retrieve; default: 0 (all)\n "
			"- sep: fields separator; default: ','\n");
		return 0;
	}

	// Positions start at 1 and must designate two different fields: otherwise
	// the ID or the value would never be read from the lines.
	int posID = atoi(argv[2]), posValue = atoi(argv[3]);
	if (posID < 1 || posValue < 1 || posID == posValue)
	{
		fprintf(stderr, "Error: posID and posValue must be distinct integers >= 1\n");
		return 1;
	}

	char sep = ',';
	if (argc > 6)
	{
		// An empty argument would yield '\0', which cannot delimit fields
		if (argv[6][0] == '\0')
		{
			fprintf(stderr, "Error: sep must be a non-empty character\n");
			return 1;
		}
		sep = argv[6][0];
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv",
		argc > 5 ? atoi(argv[5]) : 0,
		sep);
}