fix convert_to_CSV.c
[epclust.git] / data / preprocessing / convert_to_CSV.c
CommitLineData
c133b1bd
BA
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <errno.h>
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cgds/Vector.h>
c133b1bd 8
a0fa5bd0
BA
// Read an integer char by char, and position the cursor to next character.
//  - stream: input stream, read from its current position
//  - integer: output; set to 0 when no digit could be read
// A leading '-' (first character read only) makes the result negative.
// Non-digit characters before the number are skipped, but the skipping never
// goes past a line end or the end of the stream.
// Returns the first character following the digits (separator, endline,
// or .,e,E if inside readReal), or (char) EOF if the stream is exhausted.
// NOTE: no overflow check is done on the accumulated value.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such, otherwise EOF cannot be detected
	// and the skipping loop below would never terminate at end of file
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	while (curChar != EOF && curChar != '\n' && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar; //separator, endline or .,e,E (if inside readReal)
}
25
// Read a real number char by char, and position the cursor to next character.
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]
//  - stream: input stream, read from its current position
//  - real: output; set to 0 when the stream is already exhausted
// Returns the first character following the number (separator or endline),
// or (char) EOF if the stream is exhausted.
char readReal(FILE* stream, float* real)
{
	*real = 0.f;

	// Peek at the first character: the sign must be remembered here, because it
	// is lost by readInt() when the integer part is 0 (as in "-0.5")
	int first = fgetc(stream);
	if (first == EOF)
		return (char) EOF;
	ungetc(first, stream);

	int integerPart, exponent = 0;
	char curChar = readInt(stream, &integerPart);
	int negative = (first == '-' || integerPart < 0);

	// Fractional digits are accumulated in a double (numerator / divisor):
	// leading zeros are accounted for by the divisor, and long fractional parts
	// cannot overflow an int
	double numerator = 0., divisor = 1.;
	if (curChar == '.')
	{
		int digit = fgetc(stream);
		while (digit >= '0' && digit <= '9')
		{
			// Digits beyond double precision are consumed but ignored
			if (divisor < 1e17)
			{
				numerator = 10. * numerator + (digit - '0');
				divisor *= 10.;
			}
			digit = fgetc(stream);
		}
		curChar = (char) digit;
	}

	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	double magnitudeSign = (negative ? -1. : 1.);
	*real = (float) ( (integerPart + magnitudeSign * numerator / divisor)
		* pow(10, exponent) );
	return curChar; //separator or endline
}
49
a0fa5bd0
BA
// Parse one line of the input file, extracting an integer ID and a float value.
//  - ifile: input stream, positioned at the beginning of a line
//  - sep: field separator
//  - posID, ID: 1-based index of the ID field, and where to store it
//  - posValue, value: 1-based index of the value field, and where to store it
// Fields are visited left to right; the function returns once the line end
// (or the end of the file) has been reached. ID / value are left untouched
// if the corresponding field index is never met.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	for (int field = 1; ; field++)
	{
		char lastRead;
		if (field == posID)
			lastRead = readInt(ifile, ID);
		else if (field == posValue)
			lastRead = readReal(ifile, value);
		else
			lastRead = fgetc(ifile); //first character of a field to skip (or its separator)

		// Move forward to the end of current field: separator, line end or file end
		while (!(feof(ifile) || lastRead == '\n' || lastRead == sep))
			lastRead = fgetc(ifile);

		// Line (or file) exhausted: nothing more to parse
		if (lastRead == '\n' || feof(ifile))
			return;
	}
}
77
86223e27 78// Main job: parse a data file into a conventional CSV file in rows, without header
a0fa5bd0
BA
79// Current limitations:
80// - remove partial series (we could fill missing values instead)
a2fd2d76
BA
81// - consider missing fields == 0 (if ,,)
82// - IDs should be st. pos. integers
83// - UNIX linebreaks only (\n)
a0fa5bd0
BA
84int transform(const char* ifileName, int posID, int posValue,
85 const char* ofileName, int nbItems, char sep)
c133b1bd 86{
a2fd2d76
BA
87 uint64_t processedLines = 0; //execution trace
88 uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
89 int tsLength, lastID=0, ID, firstID, eof;
90 float value, tmpVal;
91 Vector* tsBuffer = vector_new(float);
86223e27 92 FILE* ifile = fopen(ifileName, "r");
a0fa5bd0 93 // Output file to write time-series sequentially, CSV format.
c133b1bd
BA
94 FILE* ofile = fopen(ofileName, "w");
95
86223e27 96 // Skip header
a0fa5bd0 97 char curChar;
86223e27 98 do
a0fa5bd0 99 curChar = fgetc(ifile);
a2fd2d76 100 while (curChar != '\n');
86223e27 101
a0fa5bd0 102 // Process one client (ID in first column) at a time
a2fd2d76 103 while (1)
c133b1bd 104 {
c133b1bd 105
a2fd2d76
BA
106 eof = feof(ifile);
107 if (!eof)
108 {
109 // Is there anything left to read? (file may end with '\n')
110 curChar = fgetc(ifile);
111 if (!feof(ifile) && curChar != '\n')
112 {
113 // Yes: read current line
114 ungetc(curChar, ifile);
115 scan_line(ifile, sep, posID, &ID, posValue, &value);
116 }
117 else
118 eof = 1;
119 }
22037304 120
a2fd2d76 121 if (ID != lastID || eof)
c133b1bd 122 {
c133b1bd
BA
123 if (lastID > 0)
124 {
a2fd2d76
BA
125 // Just starting a new time-series (or EOF): process the last one
126 if (tsLength == vector_size(tsBuffer))
c133b1bd 127 {
a2fd2d76 128 for (int i=0; i<tsLength-1; i++)
c133b1bd 129 {
a2fd2d76
BA
130 vector_get(tsBuffer, i, tmpVal);
131 fprintf(ofile, "%g%c", tmpVal, sep);
c133b1bd 132 }
a2fd2d76
BA
133 vector_get(tsBuffer, tsLength-1, tmpVal);
134 fprintf(ofile, "%g\n", tmpVal);
135 seriesCount++;
c133b1bd
BA
136 if (nbItems > 0 && ++seriesCount >= nbItems)
137 break;
138 }
c133b1bd 139 else
a2fd2d76
BA
140 {
141 // Mismatch lengths: skip series
a0fa5bd0 142 mismatchLengthCount++;
a2fd2d76 143 }
c133b1bd 144 }
22037304 145 else
a2fd2d76
BA
146 firstID = ID;
147 if (eof)
148 {
149 // Last serie is processed
150 break;
151 }
152 // Reinitialize current index of new serie
c133b1bd
BA
153 tsLength = 0;
154 lastID = ID;
155 }
c133b1bd 156
a2fd2d76
BA
157 // Fill values buffer
158 if (ID != firstID)
c133b1bd 159 {
a2fd2d76
BA
160 if (tsLength < vector_size(tsBuffer))
161 vector_set(tsBuffer, tsLength, value);
c133b1bd 162 }
a0fa5bd0 163 else
a2fd2d76
BA
164 {
165 // First serie is reference: push all values
166 vector_push(tsBuffer, value);
167 }
168 tsLength++;
169
170 if ((++processedLines) % 1000000 == 0)
171 fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
c133b1bd
BA
172 }
173
174 // finally print some statistics
a0fa5bd0
BA
175 fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
176 if (mismatchLengthCount > 0)
177 fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
c133b1bd
BA
178
179 fclose(ifile);
180 fclose(ofile);
a0fa5bd0 181 return 0;
c133b1bd 182}
86223e27 183
// Parse a base-10 integer command-line argument.
// Returns 1 and stores the value in *result on success; returns 0 if text is
// not entirely a number or does not fit in an int.
static int parse_int_arg(const char* text, int* result)
{
	char* end;
	errno = 0;
	long parsed = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE
		|| parsed < INT_MIN || parsed > INT_MAX)
	{
		return 0;
	}
	*result = (int) parsed;
	return 1;
}

// Entry point: check arguments, then run transform().
// Prints the usage and returns 0 when fewer than 3 arguments are given;
// returns 1 on invalid numeric arguments, otherwise the result of transform().
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			"  - ifileName: name of by-columns CSV input file\n"
			"  - posID: position of the identifier in a line (start at 1)\n"
			"  - posValue: position of the value of interest in a line\n"
			"  - ofileName: name of the output file; default: out.csv\n"
			"  - nbItems: number of series to retrieve; default: 0 (all)\n"
			"  - sep: fields separator; default: ','\n");
		return 0;
	}

	// Field positions start at 1: anything else would never match a field
	int posID, posValue;
	if (!parse_int_arg(argv[2], &posID) || posID < 1
		|| !parse_int_arg(argv[3], &posValue) || posValue < 1)
	{
		fprintf(stderr, "Error: posID and posValue must be integers >= 1\n");
		return 1;
	}

	int nbItems = 0;
	if (argc > 5 && !parse_int_arg(argv[5], &nbItems))
	{
		fprintf(stderr, "Error: nbItems must be an integer\n");
		return 1;
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv",
		nbItems,
		argc > 6 ? argv[6][0] : ',');
}