df31f7154a05b2ab9e9d8c337613f402adea0a32
#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cgds/Vector.h>
// Read an integer char by char, and position the cursor to next character
//   stream:  input stream, read from its current position
//   integer: receives the parsed value (0 when no digit is found before the end of the stream)
// Every non-digit character preceding the number is skipped; a '-' placed right before
// the first digit makes the result negative.
// Returns the character following the last digit (it is consumed), or (char)EOF at end of stream.
char readInt(FILE* stream, int* integer)
{
	// fgetc() returns an int: keeping it as such is the only way to tell EOF from a real character
	int c = fgetc(stream);
	int negative = 0;

	*integer = 0;

	// Skip everything up to the first digit, stopping at end of stream instead of looping forever
	while (c != EOF && (c < '0' || c > '9'))
	{
		negative = (c == '-');
		c = fgetc(stream);
	}

	// Accumulate the digits
	while (c >= '0' && c <= '9')
	{
		*integer = 10 * (*integer) + (c - '0');
		c = fgetc(stream);
	}

	if (negative)
		*integer = -(*integer);
	return (char) c;
}
// Read a real number char by char, and position the cursor to next character
//   stream: input stream, read from its current position
//   real:   receives the parsed value
// Accepted shape: [-]digits[.digits][(e|E)[sign]digits]
// Returns the character following the number (it is consumed), as reported by readInt()
// or by the fractional digits loop.
char readReal(FILE* stream, float* real)
{
	// Skip leading blanks, then peek at the first significant character: the sign has to be
	// recorded here because it cannot be recovered from an integer part equal to zero ("-0.5")
	int first;
	do
		first = fgetc(stream);
	while (first == ' ' || first == '\t');
	if (first != EOF)
		ungetc(first, stream);

	int integerPart = 0;
	char nextChar = readInt(stream, &integerPart);
	int negative = (first == '-' || integerPart < 0);
	// Work on the magnitude; the conversion to double happens before negating to avoid overflow
	double magnitude = (integerPart < 0 ? -(double)integerPart : (double)integerPart);

	if (nextChar == '.')
	{
		// Accumulate the fractional digits in floating point: leading zeros need no special
		// treatment and long mantissas cannot overflow an int
		double digits = 0., scale = 1.;
		int c;
		while ((c = fgetc(stream)) >= '0' && c <= '9')
		{
			digits = 10. * digits + (c - '0');
			scale *= 10.;
		}
		magnitude += digits / scale;
		nextChar = (char) c;
	}

	int exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	*real = (float) ((negative ? -magnitude : magnitude) * pow(10., exponent));
	return nextChar;
}
// Parse a line into integer+float (ID, value)
//   ifile:    input stream, positioned at the beginning of a line
//   sep:      fields separator
//   posID:    position (starting at 1) of the field holding the identifier
//   ID:       receives the identifier; 0 if that field is empty
//   posValue: position (starting at 1) of the field holding the value
//   value:    receives the value; 0 if that field is empty
// ID (resp. value) is left untouched when the line has fewer than posID (resp. posValue) fields.
// On return the cursor is on the first character of the next non-empty line, or at end of file.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	// fgetc() yields characters as unsigned char values: compare the separator on the same scale
	const int separator = (unsigned char) sep;
	int position = 1;

	while (1)
	{
		int nextChar = fgetc(ifile);

		if (position == posID || position == posValue)
		{
			if (nextChar == EOF || nextChar == separator
				|| nextChar == '\n' || nextChar == '\r')
			{
				// Empty field: count it as 0 rather than letting the readers
				// run into the following fields (or lines) looking for digits
				if (position == posID)
					*ID = 0;
				else
					*value = 0.f;
			}
			else
			{
				ungetc(nextChar, ifile);
				char last = (position == posID
					? readInt(ifile, ID)
					: readReal(ifile, value));
				// The readers return a plain char: rely on feof() to detect the end of file
				nextChar = (feof(ifile) ? EOF : (unsigned char) last);
			}
		}
		// else: the character just read is the beginning of a skipped field

		// Continue until next separator (or line end or file end)
		while (nextChar != EOF && nextChar != '\n' && nextChar != '\r'
			&& nextChar != separator)
		{
			nextChar = fgetc(ifile);
		}
		position++;

		if (nextChar == EOF || nextChar == '\n' || nextChar == '\r')
		{
			// Skip all potential line feeds
			while (nextChar == '\n' || nextChar == '\r')
				nextChar = fgetc(ifile);
			// Give back the first character of the next line
			if (nextChar != EOF)
				ungetc(nextChar, ifile);
			break;
		}
	}
}
// Write one time-series on a single row, values separated by sep
static void write_series(FILE* ofile, const float* series, int length, char sep)
{
	for (int i=0; i<length; i++)
	{
		if (i > 0)
			fputc(sep, ofile);
		fprintf(ofile, "%g", series[i]);
	}
	fputc('\n', ofile);
}

// Finish a series: the first one sets the reference length, the next ones are written
// only if their length matches it. Returns 1 if the series was written, 0 otherwise.
static int finish_series(FILE* ofile, const float* series, int length,
	int* refLength, char sep)
{
	if (*refLength == 0)
		*refLength = length; //first serie is considered clean
	if (length != *refLength)
		return 0;
	write_series(ofile, series, length, sep);
	return 1;
}

// Main job: parse a data file into a conventional CSV file in rows, without header
//   ifileName: input file, one (ID, value) observation per line, first line being a header
//   posID:     position (starting at 1) of the identifier in a line
//   posValue:  position (starting at 1) of the value of interest in a line
//   ofileName: output file, one time-series per row
//   nbItems:   number of series to retrieve; <= 0 means all
//   sep:       fields separator, used for both input and output
// Returns 0 on success, 1 if a file cannot be opened or written, or if memory runs out.
// Current limitations:
//  - remove series whose length differs from the first one (we could fill missing values instead)
//  - consider missing fields == 0
//  - IDs should be integers
//  - the lines of a given series must be contiguous in the input
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file '%s'\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file '%s'\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (curChar != EOF && curChar != '\n' && curChar != '\r');

	// Process one client (ID in first column) at a time
	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount=0, mismatchLengthCount=0;
	int tsLength=0, refTsLength=0, lastID=0, ID=0;
	int started=0, limitReached=0, status=0;
	float value = 0.f;
	// The length of the series is unknown beforehand: the buffer grows while reading the first one
	float* tsBuffer = NULL;
	size_t capacity = 0;

	while (1)
	{
		// Skip potential line feeds before next line
		do
			curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r');
		if (curChar == EOF)
			break;
		ungetc(curChar, ifile);

		scan_line(ifile, sep, posID, &ID, posValue, &value);

		if (started && ID != lastID)
		{
			// Just starting a new time-series: must process the last one
			if (finish_series(ofile, tsBuffer, tsLength, &refTsLength, sep))
				seriesCount++;
			else
				mismatchLengthCount++; //mismatch lengths: skip series

			// reinitialize flags
			tsLength = 0;
			if (nbItems > 0 && seriesCount >= (uint32_t)nbItems)
			{
				limitReached = 1;
				break;
			}
		}
		started = 1;
		lastID = ID;

		// Store at most refTsLength values once the reference is known, but keep counting:
		// a series longer than the reference must be reported as a mismatch too
		if (refTsLength == 0 || tsLength < refTsLength)
		{
			if ((size_t)tsLength == capacity)
			{
				size_t newCapacity = (capacity > 0 ? 2*capacity : 1024);
				float* grown = realloc(tsBuffer, newCapacity * sizeof *grown);
				if (grown == NULL)
				{
					fprintf(stderr, "ERROR: out of memory\n");
					status = 1;
					break;
				}
				tsBuffer = grown;
				capacity = newCapacity;
			}
			tsBuffer[tsLength] = value;
		}
		tsLength++;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
	}

	if (status == 0 && started && !limitReached)
	{
		// flush last time-series if all conditions are met
		if (finish_series(ofile, tsBuffer, tsLength, &refTsLength, sep))
			seriesCount++;
		else
			mismatchLengthCount++;
	}

	// finally print some statistics
	fprintf(stdout,"NOTE: %"PRIu32" series retrieved.\n",seriesCount);
	if (mismatchLengthCount > 0)
		fprintf(stdout,"WARNING: %"PRIu32" mismatch series lengths.\n",mismatchLengthCount);

	free(tsBuffer);
	fclose(ifile);
	// fclose() flushes the pending output: a failure here means the result is incomplete
	if (fclose(ofile) != 0)
	{
		fprintf(stderr, "ERROR: cannot write output file '%s'\n", ofileName);
		status = 1;
	}
	return status;
}
// Display the command-line help
static void print_usage(void)
{
	printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
		" - ifileName: name of by-columns CSV input file\n"
		" - posID: position of the identifier in a line (start at 1)\n"
		" - posValue: position of the value of interest in a line\n"
		" - ofileName: name of the output file; default: out.csv\n"
		" - nbItems: number of series to retrieve; default: 0 (all)\n"
		" - sep: fields separator; default: ','\n");
}

// Convert a whole argument into an int not smaller than minimum; return 1 on success, 0 otherwise
static int parse_int_arg(const char* text, int minimum, int* result)
{
	char* end;
	errno = 0;
	long parsed = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE
		|| parsed < minimum || parsed > INT_MAX)
	{
		return 0;
	}
	*result = (int) parsed;
	return 1;
}

// Entry point: check the command line, then run the conversion.
// Returns 0 after displaying the help (fewer than 3 arguments), 1 on invalid arguments,
// and the status of transform() otherwise.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		print_usage();
		return 0;
	}

	// Positions start at 1: with anything else no field would ever match
	// and the output would silently be empty
	int posID, posValue, nbItems = 0;
	if (!parse_int_arg(argv[2], 1, &posID)
		|| !parse_int_arg(argv[3], 1, &posValue)
		|| (argc > 5 && !parse_int_arg(argv[5], INT_MIN, &nbItems))
		|| (argc > 6 && argv[6][0] == '\0'))
	{
		print_usage();
		return 1;
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv",
		nbItems,
		argc > 6 ? argv[6][0] : ',');
}