#include <math.h>
#include <float.h>
-char readInt(FILE* stream, int64_t* integer)
+// Read an integer char by char, and position the cursor to next character.
+// - stream: input stream, consumed with fgetc()
+// - integer: output; reset to 0 first, then set to the signed value of the digits read
+// Returns the character that ended the run of digits (it has been consumed),
+// or (char)EOF when the end of the stream is reached.
+// The sign is decided by the very first character read only: a '-' there
+// negates the result; any other non-digit before the digits is just skipped.
+char readInt(FILE* stream, int* integer)
{
*integer = 0;
- char curChar = fgetc(stream);
- int sign = (curChar == '-' ? -1 : 1);
- while (curChar < '0' || curChar > '9')
- curChar = fgetc(stream);
- ungetc(curChar, stream);
- while ((curChar = fgetc(stream)) >= '0' && curChar <= '9')
- *integer = 10 * (*integer) + (int64_t) (curChar - '0');
+ // fgetc() returns an int: keep it as such so EOF cannot be mistaken for data
+ int nextChar = fgetc(stream);
+ int sign = (nextChar == '-' ? -1 : 1);
+ // Skip to the first digit, but give up at end of file instead of looping forever
+ while (nextChar != EOF && (nextChar < '0' || nextChar > '9'))
+   nextChar = fgetc(stream);
+ if (nextChar == EOF)
+   return (char) nextChar; //no digit found: *integer stays 0
+ ungetc(nextChar, stream);
+ while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9')
+   *integer = 10 * (*integer) + (nextChar - '0');
(*integer) *= sign;
- return curChar;
+ return (char) nextChar;
}
+// Read a real number char by char, and position the cursor to next character
char readReal(FILE* stream, float* real)
{
- int64_t integerPart;
+ int integerPart;
char nextChar = readInt(stream, &integerPart);
- int64_t fractionalPart = 0;
+ int fractionalPart = 0;
int countZeros = 0;
if (nextChar == '.')
{
nextChar = readInt(stream, &fractionalPart);
}
}
- int64_t exponent = 0;
+ int exponent = 0;
if (nextChar == 'e' || nextChar == 'E')
nextChar = readInt(stream, &exponent);
*real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart
return nextChar;
}
-// Parse a line into integer+float (ID, raw power)
-static void scan_line(FILE* ifile,
- int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
+// Parse a line into integer+float (ID, value)
+static void scan_line(FILE* ifile, char sep,
+ int posID, int* ID, int posValue, float* value)
{
char nextChar;
int position = 1;
while (1)
{
- if (position == posTime)
- {
- //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
- }
- else if (position == posID)
- {
- int64_t ID_on64bits;
- nextChar = readInt(ifile, &ID_on64bits);
- *ID = (uint32_t)ID_on64bits;
- }
- else if (position == posPower)
- {
- float power = FLT_MAX; //"NA"
- nextChar = readReal(ifile, &power); //?? WARNING here... if empty field ?!
- *rawPower = (float) power;
- }
+ if (position == posID)
+ nextChar = readInt(ifile, ID);
+ else if (position == posValue)
+ nextChar = readReal(ifile, value);
else
- //erase the comma (and skip field then)
- nextChar = fgetc(ifile);
+ nextChar = fgetc(ifile); //erase the comma (and skip field then)
- //continue until next comma (or line end or file end)
- while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
+ // Continue until next separator (or line end or file end)
+ while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
nextChar = fgetc(ifile);
position++;
if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
{
- // skip all potential line feeds
+ // Skip all potential line feeds
while (!feof(ifile) && nextChar == '\n' || nextChar == '\r')
nextChar = fgetc(ifile);
if (!feof(ifile))
}
}
-
-//TODO: check datetime at each line (build datetimes file ! for each year ?)
-//also fill NA with closest value in file (easy)
-//01JAN2009:00:00:00 ..........
-
-
// Main job: parse a data file into a conventional CSV file in rows, without header
-void transform(const char* ifileName, int posID, int posTime, int posValue,
- char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems
+// Current limitations:
+// - series whose length differs from the first one are dropped (missing values could be filled instead)
+// - missing fields are considered equal to 0
+// - IDs must be integers
+int transform(const char* ifileName, int posID, int posValue,
+ const char* ofileName, int nbItems, char sep)
{
- //TODO: complete timedate vector from first_time and last_time
- // --> this gives (expected) tsLength for free
-
FILE* ifile = fopen(ifileName, "r");
- // output file to write time-series sequentially, CSV format.
+ // Output file to write time-series sequentially, CSV format.
FILE* ofile = fopen(ofileName, "w");
// Skip header
- char nextChar;
+ char curChar;
do
- nextChar = fgetc(ifile);
- while (!feof(ifile) && nextChar != '\n' && nextChar != '\r')
+ curChar = fgetc(ifile);
+ while (!feof(ifile) && curChar != '\n' && curChar != '\r');
- // process one client (ID in first column) at a time
+ // Process one client (ID in first column) at a time
uint64_t processedLines = 0; //execution trace
- uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
- uint32_t mismatchLengthCount=0;
- float tsBuffer[refTsLength];
- lastID = 0;
+ uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
+ int tsLength=0, refTsLength=0, lastID=0, ID=0;
+ float value=0., tsBuffer[refTsLength];
while (!feof(ifile))
{
- // next element to read always start with a digit
- do
+ // Go to next line
+ while (!feof(ifile) && (curChar == '\n' || curChar == '\r'))
curChar = fgetc(ifile);
- while (!feof(ifile) && (curChar < '0' || curChar > '9'));
if (feof(ifile))
break;
ungetc(curChar, ifile);
- // read line
- scan_line(ifile, posID, &ID, posPower, &rawPower);
+ // Read current line
+ scan_line(ifile, sep, posID, &ID, posValue, &value);
if (ID != lastID)
{
- //just starting a new time-series: must process the last one (if there is a last one !)
+ // Just starting a new time-series: must process the last one (if exists !)
if (lastID > 0)
{
+ if (refTsLength == 0)
+ refTsLength = tsLength; //first serie is considered clean
if (tsLength == refTsLength)
{
for (int i=0; i<tsLength; i++)
{
- char* format = i<tsLength-1 ? "%g," : "%g";
- fprintf(ofile, format, tsBuffer[i]);
+ char* format = i<tsLength-1 ? "%g%c" : "%g";
+ fprintf(ofile, format, tsBuffer[i], sep);
}
fprintf(ofile, "\n");
if (nbItems > 0 && ++seriesCount >= nbItems)
break;
}
- //if something wrong happened, skip series
+ // Mismatch lengths: skip series
else
- {
- skippedSeriesCount++;
- if (tsLength != refTsLength)
- mismatchLengthCount++;
- }
+ mismatchLengthCount++;
}
// reinitialize flags
//We cannot write more than refTsLength values
if (tsLength < refTsLength)
- tsBuffer[tsLength++] = rawPower;
+ tsBuffer[tsLength++] = value;
if ((++processedLines) % 1000000 == 0)
fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
}
- if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems))
+ if (nbItems <= 0 || seriesCount < nbItems)
{
// flush last time-series if all conditions are met
- for (int i=0; i<tsLength; i++)
+ if (tsLength == refTsLength)
{
- char* format = i<tsLength-1 ? "%g," : "%g";
- fprintf(ofile, format, tsBuffer[i]);
+ for (int i=0; i<tsLength; i++)
+ {
+ char* format = i<tsLength-1 ? "%g%c" : "%g";
+ fprintf(ofile, format, tsBuffer[i], sep);
+ }
+ fprintf(ofile, "\n");
+ seriesCount++;
}
- fprintf(ofile, "\n");
- seriesCount++;
- }
- else if (nbItems <= 0 || seriesCount < nbItems)
- {
- if (tsLength != refTsLength)
+ else
mismatchLengthCount++;
}
// finally print some statistics
- if (seriesCount < nbItems)
- fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount);
- fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount);
+ fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
+ if (mismatchLengthCount > 0)
+ fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
fclose(ifile);
fclose(ofile);
+ return 0;
}
-int main(char** argv, int argc)
+// Command-line entry point: checks the arguments, then delegates to transform().
+// Prints the usage text and returns 0 when fewer than 3 arguments are given;
+// returns 1 when a field position is not an integer >= 1;
+// otherwise returns whatever transform() returns.
+int main(int argc, char** argv)
{
- //TODO: args checks...
- transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
- argv[5], argv[6], argv[7], atoi(argv[8]));
- return 0;
+ if (argc < 4) //program name + 3 arguments
+ {
+ printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
+ - ifileName: name of by-columns CSV input file\n \
+ - posID: position of the identifier in a line (start at 1)\n \
+ - posValue: position of the value of interest in a line\n \
+ - ofileName: name of the output file; default: out.csv\n \
+ - nbItems: number of series to retrieve; default: 0 (all)\n \
+ - sep: fields separator; default: ','\n");
+ return 0;
+ }
+ // Field positions are counted from 1, and atoi() yields 0 for non-numeric
+ // text: reject anything below 1 rather than run without ever matching a field.
+ int posID = atoi(argv[2]), posValue = atoi(argv[3]);
+ if (posID < 1 || posValue < 1)
+ {
+   fprintf(stderr, "Error: posID and posValue must be integers >= 1\n");
+   return 1;
+ }
+ // An empty sep argument would give '\0' as separator: fall back to the default
+ char sep = (argc > 6 && argv[6][0] != '\0') ? argv[6][0] : ',';
+ return transform(argv[1], posID, posValue,
+   argc > 4 ? argv[4] : "out.csv",
+   argc > 5 ? atoi(argv[5]) : 0,
+   sep);
}