34cb6e453adc10205e89a87099600b5f13f40984
1 #define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
3 #include <cgds/Vector.h>
// Read an integer char by char, and position the cursor to next character.
//
// Characters are discarded until the first decimal digit is met; the digits
// that follow are accumulated into *integer. The value is negated when the
// very first character read is '-'. *integer is always reset first, so it is
// 0 when the input ends before any digit was found.
//
// stream:  input stream, positioned at the start of the field
// integer: receives the parsed value
// return:  the first character read after the digits. At end of file this is
//          EOF narrowed to char, so callers should rely on feof(stream).
char readInt(FILE* stream, int* integer)
{
	*integer = 0;

	// fgetc() returns an int: keeping it in an int is the only reliable way to
	// tell EOF apart from a real character, and therefore to stop the skipping
	// loop below when the input ends without any digit.
	int c = fgetc(stream);
	const int sign = (c == '-') ? -1 : 1;

	// Skip everything that is not a digit (sign, blanks, quotes, ...)
	while (c != EOF && (c < '0' || c > '9'))
		c = fgetc(stream);

	// Accumulate the digits; on exit c is the first non-digit (already consumed)
	for (; c >= '0' && c <= '9'; c = fgetc(stream))
		*integer = 10 * (*integer) + (c - '0');

	*integer *= sign;
	return (char)c;
}
// Read a real number char by char, and position the cursor to next character.
//
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]. The integer part, the
// fractional part and the exponent are each read with readInt().
//
// stream: input stream, positioned at the start of the field
// real:   receives the parsed value
// return: the first character read after the number (same convention as readInt())
char readReal(FILE* stream, float* real)
{
	// readInt() returns 0 for both "0" and "-0", so the sign of numbers such as
	// "-0.5" must be taken from the leading character: peek at it, then put it
	// back so that readInt() sees the field unchanged.
	int first = fgetc(stream);
	if (first != EOF)
		ungetc(first, stream);

	int integerPart;
	char nextChar = readInt(stream, &integerPart);

	int fractionalPart = 0;
	int countZeros = 0;
	if (nextChar == '.')
	{
		// Leading zeros of the fraction would be lost by readInt(): count them
		while ((nextChar = fgetc(stream)) == '0')
			countZeros++;
		if (nextChar >= '1' && nextChar <= '9')
		{
			ungetc(nextChar, stream);
			nextChar = readInt(stream, &fractionalPart);
		}
	}

	int exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	// scale = 10^(number of fractional digits), leading zeros included
	double scale = 1.;
	for (int i = 0; i < countZeros; i++)
		scale *= 10.;
	for (int rest = fractionalPart; rest > 0; rest /= 10)
		scale *= 10.;

	// The fraction carries the sign of the whole number, including when the
	// integer part is zero ("0.5" is positive, "-0.5" is negative).
	const int negative = integerPart < 0 || (integerPart == 0 && first == '-');
	const double fraction = fractionalPart / scale;
	*real = (float) ((integerPart + (negative ? -fraction : fraction)) * pow(10, exponent));
	return nextChar;
}
// Parse a line into integer+float (ID, value).
//
// Fields are numbered from 1. The field at position posID is read with
// readInt(), the one at position posValue with readReal(); every other field
// is skipped. On return the line end has been consumed and the stream is
// positioned on the first character of the next non-empty line (or at end of file).
//
// ifile:    input stream, positioned at the start of a line
// sep:      field separator
// posID:    position of the identifier field
// ID:       receives the identifier (untouched if the line has no such field)
// posValue: position of the value field
// value:    receives the value (untouched if the line has no such field)
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	int position = 1;
	for (;;)
	{
		char nextChar;
		if (position == posID)
			nextChar = readInt(ifile, ID);
		else if (position == posValue)
			nextChar = readReal(ifile, value);
		else
			nextChar = fgetc(ifile); //first character of a field which is skipped

		// Continue until next separator (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds. The parentheses matter: without
			// them '&&' binds tighter than '||' and the end-of-file test only
			// guards the '\n' comparison.
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// Give back the first character of the next line, if any
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			return;
		}
	}
}
82 // Main job: parse a data file into a conventional CSV file in rows, without header
83 // Current limitations:
84 // - remove partial series (we could fill missing values instead)
85 // - consider missing fields == 0
86 // - IDs should be integers
//
// Parameters (as used in the code below):
// - ifileName: path of the input file, opened for reading
// - posID, posValue: field positions forwarded to scan_line() for the ID and the value
// - ofileName: path of the output file, opened for writing
// - nbItems: when > 0, seriesCount is compared against it after each written series
//   (presumably to stop early - confirm); when <= 0 no such limit applies
// - sep: field separator, used both to parse the input and to write the output
// NOTE(review): the return statement(s) are not part of this excerpt; main() forwards
// the returned value as the process exit status.
87 int transform(const char* ifileName
, int posID
, int posValue
,
88 const char* ofileName
, int nbItems
, char sep
)
// NOTE(review): no check of the fopen() results is visible in this excerpt - confirm
// that a NULL ifile/ofile is handled before the streams are used.
90 FILE* ifile
= fopen(ifileName
, "r");
91 // Output file to write time-series sequentially, CSV format.
92 FILE* ofile
= fopen(ofileName
, "w");
// Consume characters up to the first line end.
// NOTE(review): this looks like the tail of a do/while loop skipping the header
// line - confirm against the full file.
97 curChar
= fgetc(ifile
);
98 while (!feof(ifile
) && curChar
!= '\n' && curChar
!= '\r');
100 // Process one client (ID in first column) at a time
101 uint64_t processedLines
= 0; //execution trace
// NOTE(review): skippedSeriesCount is not used anywhere else in this excerpt.
102 uint32_t seriesCount
=0, skippedSeriesCount
=0, mismatchLengthCount
=0;
103 int tsLength
=0, refTsLength
=0, lastID
=0, ID
=0;
// NOTE(review): refTsLength is 0 at this point, so tsBuffer is declared as a
// variable-length array of zero elements, whereas C requires a VLA size greater
// than zero. The array is not resized anywhere in this excerpt, so it is also
// unclear how the values of the first series can be stored (see the
// tsLength < refTsLength guard further down) - confirm against the full file.
104 float value
=0., tsBuffer
[refTsLength
];
// Skip the line-end characters between two records, then push back the first
// character of the next line so that scan_line() starts on it.
108 while (!feof(ifile
) && (curChar
== '\n' || curChar
== '\r'))
109 curChar
= fgetc(ifile
);
112 ungetc(curChar
, ifile
);
// Extract the (ID, value) pair of the current line
115 scan_line(ifile
, sep
, posID
, &ID
, posValue
, &value
);
118 // Just starting a new time-series: must process the last one (if exists !)
// The length of the first completed series becomes the reference length
121 if (refTsLength
== 0)
122 refTsLength
= tsLength
; //first serie is considered clean
// A series is written only if it has exactly the reference length
123 if (tsLength
== refTsLength
)
// One output row per series: values printed with %g, separated by sep,
// no separator after the last value
125 for (int i
=0; i
<tsLength
; i
++)
127 char* format
= i
<tsLength
-1 ? "%g%c" : "%g";
128 fprintf(ofile
, format
, tsBuffer
[i
], sep
);
130 fprintf(ofile
, "\n");
// NOTE(review): because of the '&&' short-circuit, seriesCount is incremented
// here only when nbItems > 0; unless it is also incremented in lines not shown,
// the "series retrieved" statistic stays at 0 when all series are requested.
131 if (nbItems
> 0 && ++seriesCount
>= nbItems
)
134 // Mismatch lengths: skip series
136 mismatchLengthCount
++;
139 // reinitialize flags
144 //We cannot write more than refTsLength values
145 if (tsLength
< refTsLength
)
146 tsBuffer
[tsLength
++] = value
;
// Progress trace: one message every million processed lines
148 if ((++processedLines
) % 1000000 == 0)
149 fprintf(stdout
,"Processed %"PRIu64
" lines\n", processedLines
);
// The series still in the buffer is considered only if no limit was requested
// or if the limit has not been reached yet
152 if (nbItems
<= 0 || seriesCount
< nbItems
)
154 // flush last time-series if all conditions are met
155 if (tsLength
== refTsLength
)
157 for (int i
=0; i
<tsLength
; i
++)
159 char* format
= i
<tsLength
-1 ? "%g%c" : "%g";
160 fprintf(ofile
, format
, tsBuffer
[i
], sep
);
162 fprintf(ofile
, "\n");
166 mismatchLengthCount
++;
169 // finally print some statistics
170 fprintf(stdout
,"NOTE: %u series retrieved.\n",seriesCount
);
171 if (mismatchLengthCount
> 0)
172 fprintf(stdout
,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount
);
// Command-line entry point: forwards its arguments to transform() and returns
// the value transform() returns.
179 int main(int argc
, char** argv
)
// The three first arguments are mandatory; otherwise the usage text is printed.
// NOTE(review): what is returned after printing the usage is not visible in this excerpt.
181 if (argc
< 4) //program name + 3 arguments
183 printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
184 - ifileName: name of by-columns CSV input file\n \
185 - posID: position of the identifier in a line (start at 1)\n \
186 - posValue: position of the value of interest in a line\n \
187 - ofileName: name of the output file; default: out.csv\n \
188 - nbItems: number of series to retrieve; default: 0 (all)\n \
189 - sep: fields separator; default: ','\n");
// Optional arguments fall back to "out.csv", 0 and ',' when absent; only the
// first character of the separator argument is used.
// NOTE(review): atoi() cannot report conversion errors, so a non-numeric posID,
// posValue or nbItems is silently read as 0 - consider strtol() with validation.
194 return transform(argv
[1], atoi(argv
[2]), atoi(argv
[3]),
195 argc
> 4 ? argv
[4] : "out.csv",
196 argc
> 5 ? atoi(argv
[5]) : 0,
197 argc
> 6 ? argv
[6][0] : ',');