advance on data/preprocessing
[epclust.git] / data / preprocessing / convert_to_CSV.c
... / ...
CommitLineData
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <errno.h>
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cgds/Vector.h>
7
// Read an integer char by char, and position the cursor to next character.
//
// The very first character read decides the sign ('-' means negative). Every
// non-digit character is then skipped until a digit is found, and the run of
// consecutive digits is accumulated in base 10 into *integer.
//
// stream:  input stream, read with fgetc()
// integer: output; set to 0 when the stream ends before any digit is found
// return:  the character following the last digit (already consumed), or
//          (char)EOF when the end of the stream was reached
//
// NOTE: no overflow detection is done; the number must fit in an int.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so that EOF is detected reliably
	int c = fgetc(stream);
	int sign = (c == '-' ? -1 : 1);

	// Skip everything up to the first digit; give up at end of stream
	// (without this test the loop would never terminate on a digit-less input)
	while (c != EOF && (c < '0' || c > '9'))
		c = fgetc(stream);

	// Accumulate the digits; EOF is negative, so it ends the loop as well
	while (c >= '0' && c <= '9')
	{
		*integer = 10 * (*integer) + (c - '0');
		c = fgetc(stream);
	}

	(*integer) *= sign;
	return (char)c;
}
22
// Scale value by 10^exponent with a plain multiplication loop
static double scalePow10(double value, int exponent)
{
	if (value == 0.0)
		return value; //avoid 0 * infinity when the exponent is huge
	double factor = 1.0;
	for (int i = (exponent < 0 ? -exponent : exponent); i > 0; i--)
		factor *= 10.0;
	// Dividing (instead of multiplying by 0.1^n) keeps negative powers accurate
	return exponent < 0 ? value / factor : value * factor;
}

// Read a real number char by char, and position the cursor to next character.
//
// Accepted shape: [-]digits[.digits][(e|E)[+|-]digits]
// The very first character read decides the sign ('-' means negative); every
// non-digit character is then skipped until the first digit of the integer part.
//
// stream: input stream, read with fgetc()
// real:   output; set to 0 when the stream ends before any digit is found
// return: the character following the number (already consumed), or
//         (char)EOF when the end of the stream was reached
char readReal(FILE* stream, float* real)
{
	int c = fgetc(stream);
	// The sign is tracked separately so that values in ]-1,1[ keep the right one
	int negative = (c == '-');
	while (c != EOF && (c < '0' || c > '9'))
		c = fgetc(stream);

	double integerPart = 0.0;
	while (c >= '0' && c <= '9')
	{
		integerPart = 10.0 * integerPart + (c - '0');
		c = fgetc(stream);
	}

	// Fractional part: digits are accumulated as an integer-valued double and
	// scaled once at the end; leading zeros are accounted for by the digit count
	double fraction = 0.0;
	int fractionDigits = 0;
	if (c == '.')
	{
		while ((c = fgetc(stream)) >= '0' && c <= '9')
		{
			// Digits past the 18th cannot change a float: consume and ignore them
			if (fractionDigits < 18)
			{
				fraction = 10.0 * fraction + (c - '0');
				fractionDigits++;
			}
		}
	}

	int exponent = 0;
	if (c == 'e' || c == 'E')
	{
		c = fgetc(stream);
		int negativeExponent = (c == '-');
		if (c == '-' || c == '+')
			c = fgetc(stream);
		while (c >= '0' && c <= '9')
		{
			// Cap the accumulation: such exponents are out of float range anyway
			if (exponent < 1000)
				exponent = 10 * exponent + (c - '0');
			c = fgetc(stream);
		}
		if (negativeExponent)
			exponent = -exponent;
	}

	double magnitude = scalePow10(
		integerPart + scalePow10(fraction, -fractionDigits), exponent);
	*real = (float)(negative ? -magnitude : magnitude);
	return (char)c;
}
49
// Tell whether nothing more can be read from the stream (end of file or error)
static int stream_ended(FILE* stream)
{
	return feof(stream) || ferror(stream);
}

// Parse a line into integer+float (ID, value)
//
// Fields are delimited by sep and numbered from 1. The field number posID is
// parsed with readInt() into *ID, the field number posValue with readReal()
// into *value; all other fields are skipped. An output whose field number is
// never reached on this line is left untouched.
// On return, all the line terminators ('\n' and '\r') following the line have
// been consumed: the cursor is on the first character of the next non-empty
// line, or at the end of the file.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	int position = 1;
	for (;;)
	{
		char nextChar;
		if (position == posID)
			nextChar = readInt(ifile, ID);
		else if (position == posValue)
			nextChar = readReal(ifile, value);
		else
			nextChar = fgetc(ifile); //first char of a skipped field (or its separator)

		// Continue until next separator (or line end or file end)
		while (!stream_ended(ifile)
			&& nextChar != '\n' && nextChar != '\r' && nextChar != sep)
		{
			nextChar = fgetc(ifile);
		}
		position++;

		if (stream_ended(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds; the parentheses matter here, since
			// && binds tighter than ||
			while (!stream_ended(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// Give back the first character of the next line
			if (!stream_ended(ifile))
				ungetc(nextChar, ifile);
			return;
		}
	}
}
81
// Write one series as a single row: values separated by sep, then a newline
static void write_series(FILE* ofile, const float* values, int length, char sep)
{
	for (int i = 0; i < length; i++)
	{
		if (i > 0)
			fputc(sep, ofile);
		fprintf(ofile, "%g", values[i]);
	}
	fputc('\n', ofile);
}

// Close a series of 'length' values: the first one ever closed defines the
// reference length (*refLength); a series is written only if its length equals
// the reference. Returns 1 if the series was written, 0 if it was discarded.
static int flush_series(FILE* ofile, const float* values, int length,
	int* refLength, char sep)
{
	if (*refLength == 0)
		*refLength = length; //first series is considered clean
	if (length != *refLength)
		return 0;
	write_series(ofile, values, length, sep);
	return 1;
}

// Main job: parse a data file into a conventional CSV file in rows, without header
//
// The input is a by-columns file with one header line, then one (ID, value) pair
// per line, the lines of a given ID being contiguous. Each run of lines sharing
// the same ID becomes one output row holding the values of that series.
//
// ifileName: input file name
// posID:     position of the ID field in a line (starting at 1)
// posValue:  position of the value field in a line (starting at 1)
// ofileName: output file name (overwritten)
// nbItems:   maximum number of series to write; <= 0 means all of them
// sep:       field separator, used for both the input and the output
// return:    0 on success, 1 on failure (file access or memory allocation)
//
// Current limitations:
// - remove partial series (we could fill missing values instead):
//   the first series gives the reference length, others are dropped if they differ
// - consider missing fields == 0
// - IDs should be integers
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file %s\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file %s\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (curChar != EOF && curChar != '\n' && curChar != '\r');

	// Process one client (ID in first column) at a time
	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount = 0, mismatchLengthCount = 0;
	int tsLength = 0, refTsLength = 0, lastID = 0, ID = 0;
	int started = 0; //becomes 1 once a first data line has been read
	int limitReached = 0; //becomes 1 once nbItems series have been written
	int status = 0;
	float value = 0.f;
	// Values of the current series. The buffer grows on demand while the first
	// series is read (reference length unknown yet), and is then large enough.
	float* tsBuffer = NULL;
	size_t capacity = 0;

	for (;;)
	{
		// Go to next non-empty line. The character is read afresh each time:
		// only one character of pushback is guaranteed by ungetc()
		do
			curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r');
		if (curChar == EOF)
			break;
		ungetc(curChar, ifile);

		// Read current line
		scan_line(ifile, sep, posID, &ID, posValue, &value);
		if (!started || ID != lastID)
		{
			// Just starting a new time-series: must process the last one (if exists !)
			if (started)
			{
				if (flush_series(ofile, tsBuffer, tsLength, &refTsLength, sep))
				{
					seriesCount++;
					if (nbItems > 0 && seriesCount >= (uint32_t)nbItems)
					{
						limitReached = 1;
						break;
					}
				}
				// Mismatch lengths: skip series
				else
					mismatchLengthCount++;
			}

			// reinitialize flags
			tsLength = 0;
			lastID = ID;
			started = 1;
		}

		// Store at most refTsLength values (no bound while reading the first series),
		// but count every line so that too long series are detected as mismatches
		if (refTsLength == 0 || tsLength < refTsLength)
		{
			if ((size_t)tsLength >= capacity)
			{
				size_t newCapacity = (capacity > 0 ? 2 * capacity : 1024);
				float* grown = realloc(tsBuffer, newCapacity * sizeof *grown);
				if (grown == NULL)
				{
					fprintf(stderr, "ERROR: out of memory\n");
					status = 1;
					break;
				}
				tsBuffer = grown;
				capacity = newCapacity;
			}
			tsBuffer[tsLength] = value;
		}
		tsLength++;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout, "Processed %" PRIu64 " lines\n", processedLines);
	}

	// flush last time-series if all conditions are met
	if (status == 0 && started && !limitReached)
	{
		if (flush_series(ofile, tsBuffer, tsLength, &refTsLength, sep))
			seriesCount++;
		else
			mismatchLengthCount++;
	}

	// finally print some statistics
	fprintf(stdout, "NOTE: %" PRIu32 " series retrieved.\n", seriesCount);
	if (mismatchLengthCount > 0)
	{
		fprintf(stdout, "WARNING: %" PRIu32 " mismatch series lengths.\n",
			mismatchLengthCount);
	}

	free(tsBuffer);
	fclose(ifile);
	// fclose() flushes the pending output: a failure here means lost data
	if (fclose(ofile) != 0)
	{
		fprintf(stderr, "ERROR: cannot write output file %s\n", ofileName);
		status = 1;
	}
	return status;
}
178
// Convert a command-line argument to an int.
// Returns 1 and fills *result if text is entirely a decimal integer fitting in
// an int; returns 0 (leaving *result untouched) otherwise.
static int parse_int_arg(const char* text, int* result)
{
	char* end;
	errno = 0;
	long parsed = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE
		|| parsed < INT_MIN || parsed > INT_MAX)
	{
		return 0;
	}
	*result = (int)parsed;
	return 1;
}

// Entry point: check the command line, then run transform().
// Returns 0 after printing the usage message (fewer than 3 arguments), 1 on
// invalid arguments, and the result of transform() otherwise.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			"  - ifileName: name of by-columns CSV input file\n"
			"  - posID: position of the identifier in a line (start at 1)\n"
			"  - posValue: position of the value of interest in a line\n"
			"  - ofileName: name of the output file; default: out.csv\n"
			"  - nbItems: number of series to retrieve; default: 0 (all)\n"
			"  - sep: fields separator; default: ','\n");
		return 0;
	}

	// Field positions start at 1: anything else would silently extract nothing
	int posID, posValue;
	if (!parse_int_arg(argv[2], &posID) || posID < 1
		|| !parse_int_arg(argv[3], &posValue) || posValue < 1)
	{
		fprintf(stderr, "ERROR: posID and posValue must be positive integers\n");
		return 1;
	}

	int nbItems = 0;
	if (argc > 5 && !parse_int_arg(argv[5], &nbItems))
	{
		fprintf(stderr, "ERROR: nbItems must be an integer\n");
		return 1;
	}

	char sep = ',';
	if (argc > 6)
	{
		// An empty argument would turn the string terminator into the separator
		if (argv[6][0] == '\0')
		{
			fprintf(stderr, "ERROR: sep must be a non-empty string\n");
			return 1;
		}
		sep = argv[6][0];
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv", nbItems, sep);
}