add some prints in convert_to_CSV.c; almost working. TODO...
[epclust.git] / data / preprocessing / convert_to_CSV.c
CommitLineData
c133b1bd
BA
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <errno.h>
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cgds/Vector.h>
c133b1bd 8
a0fa5bd0
BA
// Read an integer char by char, and position the cursor to next character.
//
// Every character up to the first decimal digit is skipped, then the run of
// digits is accumulated into *integer. The result is negated only when the
// very first character read is '-'. Values too large for an int saturate at
// INT_MAX instead of overflowing.
//
// Returns the character following the digits (it has been consumed from the
// stream), or (char)EOF when the end of the stream was reached; in that case
// feof(stream) is set. If no digit is found before EOF, *integer is 0.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: a char cannot distinguish EOF from a data byte
	int nextChar = fgetc(stream);
	int sign = (nextChar == '-' ? -1 : 1);

	// Skip to the first digit; stop at EOF, otherwise this would never end
	while (nextChar != EOF && (nextChar < '0' || nextChar > '9'))
		nextChar = fgetc(stream);

	while (nextChar >= '0' && nextChar <= '9')
	{
		int digit = nextChar - '0';
		// Saturate rather than trigger signed overflow (undefined behaviour)
		if (*integer > (INT_MAX - digit) / 10)
			*integer = INT_MAX;
		else
			*integer = 10 * (*integer) + digit;
		nextChar = fgetc(stream);
	}

	(*integer) *= sign;
	return (char)nextChar;
}
28
// Read a real number char by char, and position the cursor to next character.
//
// Expected shape: [-]digits[.digits][(e|E)[-]digits]. The integer part and the
// exponent are parsed by readInt(); the fractional digits are parsed here.
//
// Returns the character following the number (it has been consumed from the
// stream), or (char)EOF at end of stream. If the stream is already at EOF,
// *real is set to 0.
char readReal(FILE* stream, float* real)
{
	*real = 0.f;

	// Peek at the first character: readInt() cannot report the sign of "-0",
	// so the sign of values in ]-1,1[ must be captured before calling it.
	int firstChar = fgetc(stream);
	if (firstChar == EOF)
		return (char)firstChar;
	ungetc(firstChar, stream);
	int negative = (firstChar == '-');

	int integerPart = 0;
	char nextChar = readInt(stream, &integerPart);

	double fraction = 0.;
	if (nextChar == '.')
	{
		// Accumulate the digits as a numerator over a power of ten; leading
		// zeros are handled naturally since they only grow the denominator.
		double numerator = 0., denominator = 1.;
		int digit;
		while ((digit = fgetc(stream)) >= '0' && digit <= '9')
		{
			// Digits beyond double precision are consumed but ignored,
			// which also keeps the denominator finite.
			if (denominator < 1e17)
			{
				numerator = 10. * numerator + (digit - '0');
				denominator *= 10.;
			}
		}
		fraction = numerator / denominator;
		nextChar = (char)digit;
	}

	int exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	// Build the magnitude first, then apply the sign read from the stream
	double magnitude = fabs((double)integerPart) + fraction;
	*real = (float)((negative ? -magnitude : magnitude) * pow(10., exponent));
	return nextChar;
}
55
a0fa5bd0
BA
// Parse a line into integer+float (ID, value)
//
// Fields are numbered from 1 and delimited by 'sep'. The field at position
// posID is read with readInt() into *ID, the one at posValue with readReal()
// into *value (posID wins if both positions are equal); every other field is
// skipped. An empty field at one of these positions yields 0. If the line
// ends before a position is reached, the corresponding output is untouched.
//
// On return the stream is positioned at the first character following the
// line terminator(s), or at end of file.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	// Same representation as fgetc() results, so both can be compared safely
	const int separator = (unsigned char)sep;
	int position = 1;
	while (1)
	{
		// Look at the first character of the field: the readers skip any
		// non-digit, so on an empty field they would run into the next one.
		int nextChar = fgetc(ifile);
		int emptyField = (nextChar == EOF || nextChar == separator
			|| nextChar == '\n' || nextChar == '\r');

		if (position == posID)
		{
			if (emptyField)
				*ID = 0;
			else
			{
				ungetc(nextChar, ifile);
				nextChar = (unsigned char)readInt(ifile, ID);
			}
		}
		else if (position == posValue)
		{
			if (emptyField)
				*value = 0.f;
			else
			{
				ungetc(nextChar, ifile);
				nextChar = (unsigned char)readReal(ifile, value);
			}
		}
		// else: ignored field, its first character has just been consumed

		// Continue until next separator (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r'
			&& nextChar != separator)
		{
			nextChar = fgetc(ifile);
		}
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// Give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
87
// Write one complete series as a single row, values separated by 'sep'
static void write_series(FILE* ofile, const float* series, int length, char sep)
{
	for (int i=0; i<length; i++)
	{
		if (i > 0)
			fputc(sep, ofile);
		fprintf(ofile, "%g", series[i]);
	}
	fputc('\n', ofile);
}

// Main job: parse a data file into a conventional CSV file in rows, without header
// Current limitations:
//  - remove partial series (we could fill missing values instead)
//  - consider missing fields == 0
//  - IDs should be integers
//
// The first line of the input is skipped as a header. Consecutive lines
// sharing the same ID form one series. The first series sets the reference
// length; any later series of a different length (shorter or longer) is
// dropped and counted as a mismatch.
//
//  - ifileName: input file, one (ID, value) observation per line
//  - posID, posValue: positions (from 1) of the ID and value fields
//  - ofileName: output file, one series per row
//  - nbItems: maximum number of series to write; <= 0 means all
//  - sep: field separator, used for both input and output
//
// Returns 0 on success, 1 if a file cannot be opened or written, or if
// memory runs out.
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file '%s'\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file '%s'\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (curChar != EOF && curChar != '\n' && curChar != '\r');

	// Process one client (ID in first column) at a time
	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount=0, mismatchLengthCount=0;
	// refTsLength stays 0 until the first series is complete
	int tsLength=0, refTsLength=0, lastID=0, ID=0;
	int seriesStarted = 0; //set once a first data line has been read
	int status = 0;
	float value = 0.f;
	float* tsBuffer = NULL; //grows while the first series is being read
	size_t capacity = 0;

	while (1)
	{
		// Go to next line: read a fresh character each time, so that nothing
		// stale is ever pushed back in front of the line
		do
			curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r');
		if (curChar == EOF)
			break;
		ungetc(curChar, ifile);

		// Read current line
		scan_line(ifile, sep, posID, &ID, posValue, &value);

		if (!seriesStarted || ID != lastID)
		{
			// Just starting a new time-series: must process the last one (if exists !)
			if (seriesStarted)
			{
				if (refTsLength == 0)
					refTsLength = tsLength; //first serie is considered clean
				if (tsLength == refTsLength)
				{
					write_series(ofile, tsBuffer, tsLength, sep);
					seriesCount++;
					if (nbItems > 0 && seriesCount >= (uint32_t)nbItems)
						break;
				}
				// Mismatch lengths: skip series
				else
					mismatchLengthCount++;
			}

			// reinitialize flags
			tsLength = 0;
			lastID = ID;
			seriesStarted = 1;
		}

		if (refTsLength == 0)
		{
			// First series: its length is unknown yet, grow the buffer on demand
			if ((size_t)tsLength == capacity)
			{
				size_t newCapacity = (capacity > 0 ? 2 * capacity : 1024);
				float* grown = realloc(tsBuffer, newCapacity * sizeof *grown);
				if (grown == NULL)
				{
					fprintf(stderr, "ERROR: out of memory\n");
					status = 1;
					break;
				}
				tsBuffer = grown;
				capacity = newCapacity;
			}
			tsBuffer[tsLength] = value;
		}
		//We cannot write more than refTsLength values
		else if (tsLength < refTsLength)
			tsBuffer[tsLength] = value;
		// Count every value, so that over-long series are detected as mismatches
		tsLength++;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
	}

	if (status == 0 && seriesStarted
		&& (nbItems <= 0 || seriesCount < (uint32_t)nbItems))
	{
		// flush last time-series if all conditions are met
		if (refTsLength == 0)
			refTsLength = tsLength; //the file holds a single series
		if (tsLength == refTsLength)
		{
			write_series(ofile, tsBuffer, tsLength, sep);
			seriesCount++;
		}
		else
			mismatchLengthCount++;
	}

	// finally print some statistics
	fprintf(stdout,"NOTE: %u series retrieved.\n",(unsigned)seriesCount);
	if (mismatchLengthCount > 0)
		fprintf(stdout,"WARNING: %u mismatch series lengths.\n",(unsigned)mismatchLengthCount);

	free(tsBuffer);
	fclose(ifile);
	// fclose() flushes pending output: a failure here means lost data
	if (fclose(ofile) != 0)
	{
		fprintf(stderr, "ERROR: cannot write output file '%s'\n", ofileName);
		status = 1;
	}
	return status;
}
86223e27 186
// Convert a command-line argument into an int.
// Returns 1 on success, 0 if 'arg' is not entirely a decimal integer or does
// not fit in an int (in which case *result is left untouched).
static int parse_int_arg(const char* arg, int* result)
{
	char* end = NULL;
	errno = 0;
	long parsed = strtol(arg, &end, 10);
	if (end == arg || *end != '\0' || errno == ERANGE
		|| parsed < INT_MIN || parsed > INT_MAX)
	{
		return 0;
	}
	*result = (int)parsed;
	return 1;
}

// Entry point: check the command line, then delegate to transform().
// Prints the usage (and returns 0) when fewer than 3 arguments are given;
// returns EXIT_FAILURE on invalid numeric arguments, else transform()'s result.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n "
			" - ifileName: name of by-columns CSV input file\n "
			" - posID: position of the identifier in a line (start at 1)\n "
			" - posValue: position of the value of interest in a line\n "
			" - ofileName: name of the output file; default: out.csv\n "
			" - nbItems: number of series to retrieve; default: 0 (all)\n "
			" - sep: fields separator; default: ','\n");
		return 0;
	}

	// Positions start at 1: anything else would never match a field
	int posID = 0, posValue = 0;
	if (!parse_int_arg(argv[2], &posID) || posID < 1
		|| !parse_int_arg(argv[3], &posValue) || posValue < 1)
	{
		fprintf(stderr, "ERROR: posID and posValue must be integers >= 1\n");
		return EXIT_FAILURE;
	}

	int nbItems = 0;
	if (argc > 5 && !parse_int_arg(argv[5], &nbItems))
	{
		fprintf(stderr, "ERROR: nbItems must be an integer\n");
		return EXIT_FAILURE;
	}

	// An empty separator argument falls back to the default
	char sep = (argc > 6 && argv[6][0] != '\0') ? argv[6][0] : ',';

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv", nbItems, sep);
}