b3ff3be133ecc06f671507bb39080d0d3ed44682
1 #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
3 #include <cgds/Vector.h>
// Read an integer char by char, and position the cursor to next character.
//
// stream:  input stream, consumed with fgetc()
// integer: output; receives the parsed value (0 when no digit is found)
//
// The value is negative only when the very first character read is '-'.
// Any other non-digit character located before the first digit is skipped.
//
// Returns the character that follows the last digit: separator, endline or
// .,e,E (if inside readReal). When the stream ends (or a read error occurs)
// the result is (char)EOF; callers should then rely on feof()/ferror().
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so EOF differs from any data byte
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	// Skip to the first digit, but give up at EOF instead of looping forever
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	// EOF is negative, hence it also terminates this loop
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar;
}
// Read a real number char by char, and position the cursor to next character.
//
// stream: input stream positioned on the first character of the number
// real:   output; receives the parsed value
//
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]. The three parts are read
// with readInt() and then recombined.
//
// Returns the character following the number (separator or endline).
char readReal(FILE* stream, float* real)
{
	// readInt() cannot report the sign when the integer part is zero ("-0.5"
	// gives 0), so peek at the first character, the one readInt() tests too.
	int firstChar = fgetc(stream);
	if (firstChar != EOF)
		ungetc(firstChar, stream);
	double sign = (firstChar == '-' ? -1. : 1.);

	int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0;
	char curChar = readInt(stream, &integerPart);
	if (curChar == '.')
	{
		// Leading zeros of the fractional part are lost in an int: count them
		while ((curChar = fgetc(stream)) == '0')
			countZeros++;
		if (curChar >= '1' && curChar <= '9')
		{
			// Give the digit back so that readInt() sees the whole fraction
			ungetc(curChar, stream);
			curChar = readInt(stream, &fractionalPart);
		}
	}
	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	// Decimal shift = skipped zeros + number of digits of fractionalPart
	double shift = countZeros
		+ floor(log10(fractionalPart > 0 ? fractionalPart : 1) + 1);
	double magnitude = (integerPart >= 0 ? integerPart : -(double)integerPart)
		+ fractionalPart / pow(10, shift);
	*real = (float) (sign * magnitude * pow(10, exponent));
	return curChar; //separator or endline
}
// Parse one line of the input into an (ID, value) pair.
//
// ifile:    input stream positioned at the beginning of a line
// sep:      character separating the fields
// posID:    1-based index of the field read into *ID (via readInt)
// posValue: 1-based index of the field read into *value (via readReal)
//
// Fields are visited from left to right until the end of the line or of the
// file; every field other than the two requested ones is skipped.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	for (int field = 1; ; field++)
	{
		char last;
		if (field == posID)
			last = readInt(ifile, ID);
		else if (field == posValue)
			last = readReal(ifile, value);
		else
			last = fgetc(ifile); //consume one character of a field to skip

		// Move forward to the next separator, line end or file end
		while (!(feof(ifile) || last == '\n' || last == sep))
			last = fgetc(ifile);

		// Line (or file) exhausted: nothing more to parse
		if (last == '\n' || feof(ifile))
			return;
	}
}
79 // Main job: parse a data file into a conventional CSV file in rows, without header
80 // Current limitations:
81 // - remove partial series (we could fill missing values instead)
82 // - consider missing fields == 0 (if ,,)
83 // - IDs should be strictly positive integers
84 // - UNIX linebreaks only (\n)
//
// Parameters:
// - ifileName: path of the input file, opened in "r" mode
// - posID, posValue: field positions forwarded to scan_line() for ID and value
// - ofileName: path of the output file, opened in "w" mode
// - nbItems: when > 0, compared with the count of series written so far
//   (presumably an upper bound on the number of output series -- TODO confirm)
// - sep: fields separator, given to scan_line() and written between output values
//
// NOTE(review): the statements visible here never test the two fopen() results
// against NULL -- confirm whether a check exists.
85 int transform(const char* ifileName
, int posID
, int posValue
,
86 const char* ofileName
, int nbItems
, char sep
)
88 uint64_t processedLines
= 0; //execution trace
89 uint32_t seriesCount
=0, skippedSeriesCount
=0, mismatchLengthCount
=0;
90 int tsLength
, lastID
=0, ID
, firstID
, eof
;
// Buffer holding the values of the series currently being read
92 Vector
* tsBuffer
= vector_new(float);
93 FILE* ifile
= fopen(ifileName
, "r");
94 // Output file to write time-series sequentially, CSV format.
95 FILE* ofile
= fopen(ofileName
, "w");
// Characters are read up to the first '\n' (presumably to skip the header line).
// NOTE(review): only '\n' ends this loop; on an input without any newline
// fgetc() would keep returning EOF -- confirm whether an EOF guard is needed.
100 curChar
= fgetc(ifile
);
101 while (curChar
!= '\n');
103 // Process one client (ID in first column) at a time
110 // Is there anything left to read? (file may end with '\n')
111 curChar
= fgetc(ifile
);
112 if (!feof(ifile
) && curChar
!= '\n')
114 // Yes: read current line
// The character read for the test above is pushed back before parsing
115 ungetc(curChar
, ifile
);
116 scan_line(ifile
, sep
, posID
, &ID
, posValue
, &value
);
122 if (ID
!= lastID
|| eof
)
126 // Just starting a new time-series (or EOF): process the last one
// A series is written only when its length equals the buffer size:
// values are printed with %g, separated by sep, one series per line.
127 if (tsLength
== vector_size(tsBuffer
))
129 for (int i
=0; i
<tsLength
-1; i
++)
131 vector_get(tsBuffer
, i
, tmpVal
);
132 fprintf(ofile
, "%g%c", tmpVal
, sep
);
// Last value of the series: terminated by a line break instead of sep
134 vector_get(tsBuffer
, tsLength
-1, tmpVal
);
135 fprintf(ofile
, "%g\n", tmpVal
);
137 if (nbItems
> 0 && ++seriesCount
>= nbItems
)
142 // Mismatch lengths: skip series
143 mismatchLengthCount
++;
150 // Last series is processed
153 // Reinitialize current index of new series
158 // Fill values buffer
// While the index is inside the buffer, the existing slot is overwritten
161 if (tsLength
< vector_size(tsBuffer
))
162 vector_set(tsBuffer
, tsLength
, value
);
166 // First series is reference: push all values
167 vector_push(tsBuffer
, value
);
// Progress trace, printed every million processed lines
171 if ((++processedLines
) % 1000000 == 0)
172 fprintf(stdout
,"Processed %"PRIu64
" lines\n", processedLines
);
175 // finally print some statistics
176 fprintf(stdout
,"NOTE: %u series retrieved.\n",seriesCount
);
177 if (mismatchLengthCount
> 0)
178 fprintf(stdout
,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount
);
// Command-line entry point: with at least three arguments, forwards them to
// transform() and returns its result; otherwise the usage text is printed.
// Optional arguments fall back to "out.csv" (ofileName), 0 (nbItems) and ','
// (sep, first character of the sixth argument).
// NOTE(review): posID, posValue and nbItems are converted with atoi(), which
// gives no way to detect a non-numeric argument -- confirm this is acceptable.
185 int main(int argc
, char** argv
)
187 if (argc
< 4) //program name + 3 arguments
189 printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
190 - ifileName: name of by-columns CSV input file\n \
191 - posID: position of the identifier in a line (start at 1)\n \
192 - posValue: position of the value of interest in a line\n \
193 - ofileName: name of the output file; default: out.csv\n \
194 - nbItems: number of series to retrieve; default: 0 (all)\n \
195 - sep: fields separator; default: ','\n");
// Enough arguments: run the conversion, filling in defaults for missing ones
200 return transform(argv
[1], atoi(argv
[2]), atoi(argv
[3]),
201 argc
> 4 ? argv
[4] : "out.csv",
202 argc
> 5 ? atoi(argv
[5]) : 0,
203 argc
> 6 ? argv
[6][0] : ',');