work on CSV preprocessing
[epclust.git] / data / preprocessing / convert_to_CSV.c
CommitLineData
c133b1bd
BA
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <float.h>
7
a0fa5bd0
BA
// Read an integer char by char, and position the cursor to next character.
//  - stream: input stream, positioned on the first character of the field
//  - integer: output; set to 0 when no digit is found
// The number is negative only if the very first character read is '-'.
// Non-digit characters before the first digit are skipped, but the scan never
// runs past a line end or the end of the file.
// Returns the first character following the digits (it is consumed), or the
// line end / end-of-file marker on which the scan stopped.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so that EOF differs from any char
	int nextChar = fgetc(stream);
	int sign = (nextChar == '-' ? -1 : 1);

	// Skip everything which is not a digit; stop on EOF (the loop would never
	// end otherwise) and on line ends (to never swallow the next line).
	while (nextChar != EOF && nextChar != '\n' && nextChar != '\r'
		&& (nextChar < '0' || nextChar > '9'))
	{
		nextChar = fgetc(stream);
	}

	// Accumulate consecutive digits
	while (nextChar >= '0' && nextChar <= '9')
	{
		*integer = 10 * (*integer) + (nextChar - '0');
		nextChar = fgetc(stream);
	}

	(*integer) *= sign;
	return (char)nextChar;
}
22
// Read a real number char by char, and position the cursor to next character.
// Accepted shape: [-]digits[.digits][(e|E)[+|-]digits]
//  - stream: input stream, positioned on the first character of the field
//  - real: output; set to 0 if the stream is already at its end
// Returns the first character following the number (it is consumed).
char readReal(FILE* stream, float* real)
{
	// The sign cannot be recovered from the integer part when it is 0
	// ("-0.5" vs "0.5"), so peek at the first character to remember it.
	int firstChar = fgetc(stream);
	if (firstChar == EOF)
	{
		*real = 0.;
		return (char)EOF;
	}
	ungetc(firstChar, stream);
	double sign = (firstChar == '-' ? -1. : 1.);

	int integerPart;
	char nextChar = readInt(stream, &integerPart);

	// Fractional part = numerator / 10^(number of digits). Accumulating in
	// double handles leading zeros naturally and cannot overflow an int.
	double fracNumerator = 0., fracDenominator = 1.;
	if (nextChar == '.')
	{
		int digit;
		while ((digit = fgetc(stream)) >= '0' && digit <= '9')
		{
			// Digits beyond double precision are consumed but ignored
			if (fracDenominator < 1e18)
			{
				fracNumerator = 10. * fracNumerator + (digit - '0');
				fracDenominator *= 10.;
			}
		}
		nextChar = (char)digit;
	}

	int exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	// integerPart already carries the sign: use its magnitude only
	*real = (float)( sign
		* (fabs((double)integerPart) + fracNumerator / fracDenominator)
		* pow(10., exponent) );
	return nextChar;
}
49
a0fa5bd0
BA
// Parse a line into integer+float (ID, value)
//  - ifile: input stream, positioned on the first character of a line
//  - sep: fields separator
//  - posID, ID: position (starting at 1) of the identifier field, and output
//  - posValue, value: position of the value field, and output
// A missing (empty) ID or value field is read as 0.
// On return, the whole line and all following line feeds are consumed: the
// stream is positioned on the first character of the next non-empty line.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	int position = 1;
	while (1)
	{
		// First character of the current field (the separator which ended the
		// previous field has already been consumed).
		int nextChar = fgetc(ifile);

		if (position == posID || position == posValue)
		{
			int missing = (nextChar == EOF || nextChar == '\n'
				|| nextChar == '\r' || nextChar == sep);
			if (missing)
			{
				// Empty field: do not let the readers run into the next field
				if (position == posID)
					*ID = 0;
				else
					*value = 0.;
			}
			else
			{
				ungetc(nextChar, ifile);
				if (position == posID)
					nextChar = readInt(ifile, ID);
				else
					nextChar = readReal(ifile, value);
			}
		}
		// else: nothing to read, the field is just skipped below

		// Continue until next separator (or line end or file end)
		int atEnd = (feof(ifile) || ferror(ifile));
		while (!atEnd && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
		{
			nextChar = fgetc(ifile);
			atEnd = (nextChar == EOF);
		}
		position++;

		if (atEnd || nextChar == '\n' || nextChar == '\r')
		{
			if (!atEnd)
			{
				// Skip all potential line feeds
				while (nextChar == '\n' || nextChar == '\r')
					nextChar = fgetc(ifile);
				// Give back the first character of the next line
				if (nextChar != EOF)
					ungetc(nextChar, ifile);
			}
			break;
		}
	}
}
81
// Write one series as a single CSV row: values joined by sep, then a newline
static void write_series(FILE* ofile, const float* series, int length, char sep)
{
	for (int i=0; i<length; i++)
	{
		if (i > 0)
			fputc(sep, ofile);
		fprintf(ofile, "%g", series[i]);
	}
	fputc('\n', ofile);
}

// Main job: parse a data file into a conventional CSV file in rows, without header
//  - ifileName: by-columns input file; its first line (header) is skipped
//  - posID, posValue: positions (starting at 1) of the ID and value fields
//  - ofileName: output file, one series per row
//  - nbItems: maximum number of series to write; <= 0 means all
//  - sep: fields separator, used for both input and output
// Consecutive lines sharing the same ID form one series. The first series
// defines the reference length.
// Returns 0 on success, 1 on error (file access or memory allocation).
// Current limitations:
//  - remove partial series (we could fill missing values instead)
//  - consider missing fields == 0
//  - IDs should be integers > 0 (lines with other IDs are ignored)
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file '%s'\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file '%s'\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (curChar != EOF && curChar != '\n' && curChar != '\r');

	// Process one client at a time
	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount=0, mismatchLengthCount=0;
	int tsLength=0, refTsLength=0, lastID=0, ID=0;
	int status = 0;
	float value = 0.;
	// Buffer grows while the reference length is unknown, then stays fixed
	float* tsBuffer = NULL;
	size_t capacity = 0;

	while (1)
	{
		// Go to next non-empty line; the character is read afresh at each
		// iteration so that only a character of the *current* line is pushed back.
		do
			curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r');
		if (curChar == EOF)
			break;
		ungetc(curChar, ifile);

		// Read current line
		scan_line(ifile, sep, posID, &ID, posValue, &value);
		if (ID != lastID)
		{
			// Just starting a new time-series: must process the last one (if exists !)
			if (lastID > 0)
			{
				if (refTsLength == 0)
					refTsLength = tsLength; //first serie is considered clean
				if (tsLength == refTsLength)
				{
					write_series(ofile, tsBuffer, tsLength, sep);
					seriesCount++;
					if (nbItems > 0 && seriesCount >= (uint32_t)nbItems)
						break;
				}
				// Mismatch lengths: skip series
				else
					mismatchLengthCount++;
			}

			// reinitialize flags
			tsLength = 0;
			lastID = ID;
		}

		if (refTsLength == 0 && (size_t)tsLength == capacity)
		{
			// Reference length still unknown: make room for one more value
			size_t newCapacity = (capacity > 0 ? 2 * capacity : 1024);
			float* newBuffer = NULL;
			if (newCapacity <= SIZE_MAX / sizeof *tsBuffer)
				newBuffer = realloc(tsBuffer, newCapacity * sizeof *tsBuffer);
			if (newBuffer == NULL)
			{
				fprintf(stderr, "ERROR: out of memory\n");
				status = 1;
				break;
			}
			tsBuffer = newBuffer;
			capacity = newCapacity;
		}
		// We cannot store more than refTsLength values (once it is known), but
		// keep counting so that too long series are detected as mismatches.
		if (refTsLength == 0 || tsLength < refTsLength)
			tsBuffer[tsLength] = value;
		tsLength++;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
	}

	// flush last time-series if all conditions are met
	if (status == 0 && lastID > 0 && tsLength > 0
		&& (nbItems <= 0 || seriesCount < (uint32_t)nbItems))
	{
		if (refTsLength == 0)
			refTsLength = tsLength; //the file contains only one series
		if (tsLength == refTsLength)
		{
			write_series(ofile, tsBuffer, tsLength, sep);
			seriesCount++;
		}
		else
			mismatchLengthCount++;
	}

	// finally print some statistics
	fprintf(stdout,"NOTE: %"PRIu32" series retrieved.\n",seriesCount);
	if (mismatchLengthCount > 0)
		fprintf(stdout,"WARNING: %"PRIu32" mismatch series lengths.\n",mismatchLengthCount);

	free(tsBuffer);
	fclose(ifile);
	// fclose() flushes pending output: a failure here means lost data
	if (fclose(ofile) != 0)
	{
		fprintf(stderr, "ERROR: cannot write output file '%s'\n", ofileName);
		status = 1;
	}
	return status;
}
86223e27 178
// Entry point: parse the command line and run transform().
// With fewer than 3 arguments, print the usage and return 0.
// Returns 1 on invalid arguments, otherwise the value returned by transform().
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			"  - ifileName: name of by-columns CSV input file\n"
			"  - posID: position of the identifier in a line (start at 1)\n"
			"  - posValue: position of the value of interest in a line\n"
			"  - ofileName: name of the output file; default: out.csv\n"
			"  - nbItems: number of series to retrieve; default: 0 (all)\n"
			"  - sep: fields separator; default: ','\n");
		return 0;
	}

	// atoi() returns 0 on non-numeric input, which is not a valid position:
	// reject it instead of silently producing an empty result.
	int posID = atoi(argv[2]), posValue = atoi(argv[3]);
	if (posID < 1 || posValue < 1 || posID == posValue)
	{
		fprintf(stderr, "ERROR: posID and posValue must be distinct integers >= 1\n");
		return 1;
	}

	char sep = ',';
	if (argc > 6)
	{
		sep = argv[6][0];
		if (sep == '\0')
		{
			fprintf(stderr, "ERROR: sep must be a non-empty character\n");
			return 1;
		}
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv",
		argc > 5 ? atoi(argv[5]) : 0,
		sep);
}