df31f7154a05b2ab9e9d8c337613f402adea0a32
[epclust.git] / data / preprocessing / convert_to_CSV.c
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
8
// Read a base-10 integer char by char.
//  - stream:  input stream, positioned at the beginning of the field to parse
//  - integer: output; receives the parsed value (0 when no digit is found)
// The number is negative only if the very first character read is '-'.
// Non-digit characters in front of the number are skipped, but the search never
// goes past the end of the current line or the end of the file.
// Values too large for an int saturate at INT_MAX (-INT_MAX when negative).
// Return the first character following the digits; that character is consumed.
// At end of file the returned value is (char)EOF: callers should test feof().
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: a char cannot reliably represent EOF
	int nextChar = fgetc(stream);
	int sign = (nextChar == '-' ? -1 : 1);

	// Look for the first digit, without leaving the current line
	while (nextChar != EOF && nextChar != '\n' && nextChar != '\r'
		&& (nextChar < '0' || nextChar > '9'))
	{
		nextChar = fgetc(stream);
	}

	// Accumulate the digits
	while (nextChar >= '0' && nextChar <= '9')
	{
		int digit = nextChar - '0';
		if (*integer <= (INT_MAX - digit) / 10)
			*integer = 10 * (*integer) + digit;
		else
			*integer = INT_MAX; //saturate: signed overflow is undefined behavior
		nextChar = fgetc(stream);
	}

	(*integer) *= sign;
	return (char)nextChar;
}
28
// Read a real number char by char: [-]digits[.digits][(e|E)[sign]digits]
//  - stream: input stream, positioned at the beginning of the field to parse
//  - real:   output; receives the parsed value (0 if the stream is already at its end)
// The integer part and the exponent are parsed by readInt().
// Return the first character following the number; that character is consumed.
// At end of file the returned value is (char)EOF: callers should test feof().
char readReal(FILE* stream, float* real)
{
	*real = 0.;

	// Peek at the first character: the sign must be remembered separately,
	// because an integer part equal to 0 cannot carry it ("-0.5" vs "0.5").
	int firstChar = fgetc(stream);
	if (firstChar == EOF)
		return (char)firstChar;
	ungetc(firstChar, stream);
	int negative = (firstChar == '-');

	int integerPart;
	char nextChar = readInt(stream, &integerPart);
	double magnitude = (integerPart < 0 ? -(double)integerPart : (double)integerPart);

	if (nextChar == '.')
	{
		// Read fractional digits directly (leading zeros included) as a fraction
		// numerator/denominator; digits beyond double precision are dropped.
		double numerator = 0., denominator = 1.;
		int c;
		while ((c = fgetc(stream)) >= '0' && c <= '9')
		{
			if (denominator < 1e15)
			{
				numerator = 10. * numerator + (c - '0');
				denominator *= 10.;
			}
		}
		magnitude += numerator / denominator;
		nextChar = (char)c;
	}

	int exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	*real = (float)( (negative ? -magnitude : magnitude) * pow(10., exponent) );
	return nextChar;
}
55
// Parse one line into integer+float (ID, value).
//  - ifile:    input stream, positioned at the beginning of a line
//  - sep:      fields separator
//  - posID:    position (starting at 1) of the field holding the identifier
//  - ID:       output; identifier read by readInt(), or 0 if that field is empty
//  - posValue: position (starting at 1) of the field holding the value
//  - value:    output; value read by readReal(), or 0 if that field is empty
// If a line has fewer fields than posID (resp. posValue), *ID (resp. *value)
// is left untouched. On return, the stream is positioned at the first character
// of the next non-empty line, or at end of file.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	char nextChar;
	int position = 1;
	while (1)
	{
		// Peek at the first character of the field: an empty field must not be
		// given to the readers, since they would skip the separator and parse
		// the following field instead.
		int first = fgetc(ifile);
		int emptyField = (first == EOF || first == '\n' || first == '\r'
			|| first == (unsigned char)sep);

		if (position == posID)
		{
			if (emptyField)
			{
				*ID = 0;
				nextChar = (char)first;
			}
			else
			{
				ungetc(first, ifile);
				nextChar = readInt(ifile, ID);
			}
		}
		else if (position == posValue)
		{
			if (emptyField)
			{
				*value = 0.;
				nextChar = (char)first;
			}
			else
			{
				ungetc(first, ifile);
				nextChar = readReal(ifile, value);
			}
		}
		else
			nextChar = (char)first; //uninteresting field: just skip it

		// Continue until next separator (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// Give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
87
// Write one series as a single CSV row: values separated by sep, then a line feed.
static void write_series(FILE* ofile, const float* series, int length, char sep)
{
	for (int i=0; i<length; i++)
	{
		if (i > 0)
			fputc(sep, ofile);
		fprintf(ofile, "%g", series[i]);
	}
	fputc('\n', ofile);
}

// Main job: parse a data file into a conventional CSV file in rows, without header
//  - ifileName: by-columns input file (first line = header, then one value per line)
//  - posID:     position (starting at 1) of the identifier in a line
//  - posValue:  position (starting at 1) of the value of interest in a line
//  - ofileName: output file, one row per series
//  - nbItems:   number of series to retrieve; <= 0 means all
//  - sep:       fields separator (input and output)
// Consecutive lines sharing the same ID form one series. The first series gives
// the reference length; any series with a different length is skipped and counted.
// Return 0 on success, 1 on error (file cannot be opened/written, out of memory).
// Current limitations:
// - remove partial series (we could fill missing values instead)
// - consider missing fields == 0
// - IDs should be integers
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file %s\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file %s\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (curChar != EOF && curChar != '\n' && curChar != '\r');

	// Process one client (ID in the posID-th column) at a time
	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount=0, mismatchLengthCount=0;
	int tsLength=0, refTsLength=0, lastID=0, ID=0;
	int seriesOpen = 0; //is a series currently being accumulated ?
	int status = 0;
	float value = 0.;
	float* tsBuffer = NULL; //values of the current series (grown on demand)
	size_t capacity = 0;
	while (1)
	{
		// Go to next non-empty line (if any)
		do
			curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r');
		int hasLine = (curChar != EOF);
		if (hasLine)
		{
			ungetc(curChar, ifile);
			// Read current line
			scan_line(ifile, sep, posID, &ID, posValue, &value);
		}

		// A series ends when the ID changes or when the input is exhausted
		if (seriesOpen && (!hasLine || ID != lastID))
		{
			if (refTsLength == 0)
				refTsLength = tsLength; //first serie is considered clean
			if (tsLength == refTsLength)
			{
				write_series(ofile, tsBuffer, tsLength, sep);
				seriesCount++;
			}
			// Mismatch lengths: skip series
			else
				mismatchLengthCount++;
			seriesOpen = 0;
			if (nbItems > 0 && seriesCount >= (uint32_t)nbItems)
				break;
		}
		if (!hasLine)
			break;

		if (!seriesOpen)
		{
			// Just starting a new time-series
			tsLength = 0;
			lastID = ID;
			seriesOpen = 1;
		}

		// Values beyond the reference length are not stored (the series will be
		// skipped anyway), but they are counted so that the mismatch is detected.
		if (refTsLength == 0 || tsLength < refTsLength)
		{
			if ((size_t)tsLength >= capacity)
			{
				size_t newCapacity = (capacity > 0 ? 2 * capacity : 1024);
				float* grown = realloc(tsBuffer, newCapacity * sizeof *grown);
				if (grown == NULL)
				{
					fprintf(stderr, "ERROR: out of memory\n");
					status = 1;
					break;
				}
				tsBuffer = grown;
				capacity = newCapacity;
			}
			tsBuffer[tsLength] = value;
		}
		tsLength++;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout,"Processed %" PRIu64 " lines\n", processedLines);
	}

	// finally print some statistics
	fprintf(stdout,"NOTE: %" PRIu32 " series retrieved.\n",seriesCount);
	if (mismatchLengthCount > 0)
		fprintf(stdout,"WARNING: %" PRIu32 " mismatch series lengths.\n",mismatchLengthCount);

	free(tsBuffer);
	fclose(ifile);
	// fclose() flushes the output: a failure here means data were lost
	if (fclose(ofile) != 0)
	{
		fprintf(stderr, "ERROR: cannot write output file %s\n", ofileName);
		status = 1;
	}
	return status;
}
186
// Print the command-line help on the given stream.
static void print_usage(FILE* stream)
{
	fprintf(stream,
		"Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
		" - ifileName: name of by-columns CSV input file\n"
		" - posID: position of the identifier in a line (start at 1)\n"
		" - posValue: position of the value of interest in a line\n"
		" - ofileName: name of the output file; default: out.csv\n"
		" - nbItems: number of series to retrieve; default: 0 (all)\n"
		" - sep: fields separator; default: ','\n");
}

// Convert text into an int. Return 1 on success, 0 if text is not entirely
// a base-10 integer fitting in an int (in which case *result is untouched).
static int parse_int(const char* text, int* result)
{
	char* end;
	long parsed = strtol(text, &end, 10);
	if (end == text || *end != '\0' || parsed < INT_MIN || parsed > INT_MAX)
		return 0;
	*result = (int)parsed;
	return 1;
}

// Entry point: check the command-line arguments, then run transform().
// Return 0 when only the usage is printed (too few arguments), 1 on invalid
// arguments, and the status of transform() otherwise.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		print_usage(stdout);
		return 0;
	}

	int posID = 0, posValue = 0, nbItems = 0;
	if (!parse_int(argv[2], &posID) || posID < 1
		|| !parse_int(argv[3], &posValue) || posValue < 1
		|| (argc > 5 && !parse_int(argv[5], &nbItems)))
	{
		fprintf(stderr,
			"ERROR: posID and posValue must be integers >= 1, nbItems must be an integer\n");
		print_usage(stderr);
		return 1;
	}

	const char* ofileName = (argc > 4 ? argv[4] : "out.csv");
	// An empty separator argument falls back to the default one
	char sep = (argc > 6 && argv[6][0] != '\0' ? argv[6][0] : ',');
	return transform(argv[1], posID, posValue, ofileName, nbItems, sep);
}