646c4407380ccf396971fc8d89bd96ace8537899
1 #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
3 #include <cgds/Vector.h>
// Read an integer char by char, and position the cursor to next character.
//
// stream:  input stream, read one character at a time with fgetc()
// integer: output; always written (0 when no digit is met before end of file)
//
// The sign is taken from the very first character read: '-' means negative.
// Every character up to the first digit is skipped, then consecutive digits
// are accumulated in base 10. Values too large for an int are not detected.
//
// Returns the character that follows the last digit (it is consumed from the
// stream), or (char)EOF when the end of the stream was reached.
char readInt(FILE* stream, int* integer)
{
	// fgetc() returns an int: keep it as such so that EOF stays distinguishable
	// from every valid character
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	*integer = 0;

	// Skip anything which is not a digit; stop at end of file instead of
	// spinning forever on a stream that contains no more digits
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);

	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	*integer *= sign;

	return (char)curChar; //separator, endline or .,e,E (if inside readReal)
}
// Read a real number char by char, and position the cursor to next character.
//
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]
//
// stream: input stream, positioned on the first character of the number
// real:   output; receives the parsed value converted to float
//
// Returns the character that follows the number (separator or endline); it is
// consumed from the stream.
char readReal(FILE* stream, float* real)
{
	// Peek at the first character: when the integer part is 0 ("-0.5") the
	// sign cannot be recovered from the integer returned by readInt()
	int firstChar = fgetc(stream);
	if (firstChar != EOF)
		ungetc(firstChar, stream);
	double sign = (firstChar == '-' ? -1. : 1.);

	int integerPart = 0, exponent = 0;
	char curChar = readInt(stream, &integerPart);
	// Work on the magnitude; the sign is applied once, at the end
	double magnitude = (integerPart < 0 ? -(double)integerPart : (double)integerPart);

	if (curChar == '.')
	{
		// Accumulate fractional digits directly: leading zeros need no special
		// counting and long fractions cannot overflow an int
		double fraction = 0., scale = 1.;
		int digit;
		while ((digit = fgetc(stream)) >= '0' && digit <= '9')
		{
			fraction = 10. * fraction + (digit - '0');
			scale *= 10.;
		}
		magnitude += fraction / scale;
		curChar = (char)digit;
	}

	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	*real = (float)(sign * magnitude * pow(10, exponent));
	return curChar; //separator or endline
}
// NOTE(review): the text of this function looks incomplete. Its braces, the
// declarations of `position` and `curChar`, and whatever loop encloses the
// statements below are not visible here, so only the visible statements are
// described.
//
// ifile:    input stream, handed to readInt(), readReal(), fgetc() and feof()
// sep:      field separator character
// posID:    field position at which readInt() stores an integer into *ID
// posValue: field position at which readReal() stores a float into *value
50 // Parse a line into integer+float (ID, value)
51 static void scan_line(FILE* ifile
, char sep
,
52 int posID
, int* ID
, int posValue
, float* value
)
// Only two fields are parsed: the one at posID (as an integer) and the one at
// posValue (as a real number). Both readers return the character that follows
// the number, which is kept in curChar.
58 if (position
== posID
)
59 curChar
= readInt(ifile
, ID
);
60 else if (position
== posValue
)
61 curChar
= readReal(ifile
, value
);
// Presumably the branch taken for any other field (the `else` is not visible)
63 curChar
= fgetc(ifile
); //erase the comma (and skip field then)
65 // Continue until next separator (or line end or file end)
66 while (!feof(ifile
) && curChar
!= '\n' && curChar
!= sep
)
67 curChar
= fgetc(ifile
);
// A newline or the end of the file terminates the current line
70 if (curChar
== '\n' || feof(ifile
))
72 // Reached end of line
// NOTE(review): the text of this function looks incomplete. Braces, several
// declarations (`curChar`, `value`, `tmpVal`), loop heads, resource cleanup and
// the return statement are not visible here. The comments below describe only
// the visible statements.
//
// ifileName: path of the input file, opened for reading
// posID:     field position of the integer identifier, forwarded to scan_line()
// posValue:  field position of the value, forwarded to scan_line()
// ofileName: path of the output file, opened for writing
// nbItems:   when > 0, compared against the number of series written so far;
//            main() documents 0 as "all"
// sep:       field separator, used for parsing and between output values
78 // Main job: parse a data file into a conventional CSV file in rows, without header
79 // Current limitations:
80 // - remove partial series (we could fill missing values instead)
81 // - consider missing fields == 0 (if ,,)
82 // - IDs should be strictly positive integers
83 // - UNIX linebreaks only (\n)
84 int transform(const char* ifileName
, int posID
, int posValue
,
85 const char* ofileName
, int nbItems
, char sep
)
87 uint64_t processedLines
= 0; //execution trace
88 uint32_t seriesCount
=0, skippedSeriesCount
=0, mismatchLengthCount
=0;
89 int tsLength
, lastID
=0, ID
, firstID
, eof
;
// Buffer holding the values of the series currently being read
91 Vector
* tsBuffer
= vector_new(float);
// NOTE(review): no check of the fopen() results is visible in this text
92 FILE* ifile
= fopen(ifileName
, "r");
93 // Output file to write time-series sequentially, CSV format.
94 FILE* ofile
= fopen(ofileName
, "w");
// Presumably the body and tail of a do/while which skips the first (header)
// line; the `do` itself is not visible
99 curChar
= fgetc(ifile
);
100 while (curChar
!= '\n');
102 // Process one client (ID in first column) at a time
109 // Is there anything left to read? (file may end with '\n')
110 curChar
= fgetc(ifile
);
111 if (!feof(ifile
) && curChar
!= '\n')
113 // Yes: read current line
// Put the probed character back so that scan_line() sees the whole line
114 ungetc(curChar
, ifile
);
115 scan_line(ifile
, sep
, posID
, &ID
, posValue
, &value
);
// A change of ID (or the end of the input) closes the series being buffered
121 if (ID
!= lastID
|| eof
)
125 // Just starting a new time-series (or EOF): process the last one
// Only series whose length equals the buffer size are written
126 if (tsLength
== vector_size(tsBuffer
))
// All values but the last are followed by the separator...
128 for (int i
=0; i
<tsLength
-1; i
++)
130 vector_get(tsBuffer
, i
, tmpVal
);
131 fprintf(ofile
, "%g%c", tmpVal
, sep
);
// ...and the last one by a line break: one series per output row
133 vector_get(tsBuffer
, tsLength
-1, tmpVal
);
134 fprintf(ofile
, "%g\n", tmpVal
);
// Counts the series just written; the action taken once nbItems is reached is
// not visible here
136 if (nbItems
> 0 && ++seriesCount
>= nbItems
)
141 // Mismatch lengths: skip series
142 mismatchLengthCount
++;
149 // Last serie is processed
152 // Reinitialize current index of new serie
157 // Fill values buffer
// Overwrite an existing slot while one exists at index tsLength
160 if (tsLength
< vector_size(tsBuffer
))
161 vector_set(tsBuffer
, tsLength
, value
);
165 // First serie is reference: push all values
166 vector_push(tsBuffer
, value
);
// Progress trace, once every million processed lines
170 if ((++processedLines
) % 1000000 == 0)
171 fprintf(stdout
,"Processed %"PRIu64
" lines\n", processedLines
);
174 // finally print some statistics
175 fprintf(stdout
,"NOTE: %u series retrieved.\n",seriesCount
);
176 if (mismatchLengthCount
> 0)
177 fprintf(stdout
,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount
);
// Command-line entry point: prints a usage message when fewer than three
// arguments are given, otherwise forwards the arguments to transform() and
// returns its result.
// NOTE(review): the braces and whatever follows the usage message (presumably
// an early return) are not visible in this text.
184 int main(int argc
, char** argv
)
186 if (argc
< 4) //program name + 3 arguments
188 printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
189 - ifileName: name of by-columns CSV input file\n \
190 - posID: position of the identifier in a line (start at 1)\n \
191 - posValue: position of the value of interest in a line\n \
192 - ofileName: name of the output file; default: out.csv\n \
193 - nbItems: number of series to retrieve; default: 0 (all)\n \
194 - sep: fields separator; default: ','\n");
// Optional arguments fall back to "out.csv", 0 and ',' respectively; only the
// first character of the separator argument is used. Numeric arguments go
// through atoi(), which does not report conversion errors.
199 return transform(argv
[1], atoi(argv
[2]), atoi(argv
[3]),
200 argc
> 4 ? argv
[4] : "out.csv",
201 argc
> 5 ? atoi(argv
[5]) : 0,
202 argc
> 6 ? argv
[6][0] : ',');