#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
// NOTE(review): the header names were missing from the five #include lines;
// the ones below are what this code needs -- confirm against the build.
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
#include <float.h>

// Non-zero when c is an ASCII decimal digit (EOF is never a digit).
static int is_digit(int c)
{
	return c >= '0' && c <= '9';
}

// Non-zero when c terminates a CSV field: separator, line break or end of input.
static int ends_field(int c)
{
	return c == EOF || c == ',' || c == '\n' || c == '\r';
}

// Shared scanner behind readInt(), readReal() and scan_line().
// Skips forward to the first decimal digit, then accumulates the run of digits.
//   bounded   != 0: give up at the end of the current CSV field instead of
//             running into the next one; the terminator is then returned with
//             *ndigits == 0.
//   negative  set to 1 when the very first character read is '-'.
//   magnitude unsigned value of the digit run, saturated at INT64_MAX.
//   ndigits   number of digits consumed (0: no number was found).
// Returns the character that stopped the scan (already consumed), or EOF.
// The character is kept in an int so that EOF stays distinguishable from data.
static int scan_unsigned(FILE* stream, int bounded, int* negative,
	int64_t* magnitude, int* ndigits)
{
	int c = fgetc(stream);
	*negative = (c == '-');
	*magnitude = 0;
	*ndigits = 0;

	// stopping on EOF is what prevents an endless loop on exhausted input
	while (c != EOF && !is_digit(c)) {
		if (bounded && ends_field(c))
			return c;
		c = fgetc(stream);
	}

	while (is_digit(c)) {
		int digit = c - '0';
		if (*magnitude > (INT64_MAX - digit) / 10)
			*magnitude = INT64_MAX; // saturate rather than overflow (UB)
		else
			*magnitude = 10 * (*magnitude) + digit;
		++(*ndigits);
		c = fgetc(stream);
	}
	return c;
}

// Returns value * 10^exponent. Negative exponents divide by the power of ten,
// so inputs such as "25e-1" yield exactly 2.5.
static double scale_by_power_of_ten(double value, int64_t exponent)
{
	// past a few hundred steps a double is already 0 or infinity
	const int64_t maxSteps = 400;
	int64_t steps = (exponent < 0 ? -exponent : exponent);
	if (steps > maxSteps)
		steps = maxSteps;

	double power = 1.0;
	for (int64_t i = 0; i < steps; i++)
		power *= 10.0;
	return exponent < 0 ? value / power : value * power;
}

// Scans a real number written as [-]digits[.digits][(e|E)[-]digits].
// bounded has the same meaning as in scan_unsigned().
// *found is set to 1 when at least one digit of the integer part was read;
// *value then holds the number (it is 0 otherwise).
// Returns the character that stopped the scan (already consumed), or EOF.
static int scan_real(FILE* stream, int bounded, double* value, int* found)
{
	int negative, ndigits;
	int64_t whole;
	int c = scan_unsigned(stream, bounded, &negative, &whole, &ndigits);
	*found = (ndigits > 0);

	// fractional digits; those beyond double precision are consumed but ignored
	double fraction = 0.0, divisor = 1.0;
	if (c == '.') {
		while (is_digit(c = fgetc(stream))) {
			if (divisor < 1e18) {
				fraction = 10.0 * fraction + (c - '0');
				divisor *= 10.0;
			}
		}
	}

	int64_t exponent = 0;
	if (c == 'e' || c == 'E') {
		int expNegative, expDigits;
		c = scan_unsigned(stream, bounded, &expNegative, &exponent, &expDigits);
		if (expNegative)
			exponent = -exponent;
	}

	// The sign is applied last: deriving it from the integer part would turn
	// "0.5" into -0.5, since an integer part of 0 carries no sign.
	double magnitude =
		scale_by_power_of_ten((double)whole + fraction / divisor, exponent);
	*value = (negative ? -magnitude : magnitude);
	return c;
}

// Reads the next decimal integer from stream into *integer.
// Any non-digit characters in front of the number are skipped; the value is
// negated when the first character read is '-'.
// *integer is 0 when the input ends before a digit is found.
// Returns the character following the number (already consumed); this is
// (char)EOF at end of input.
char readInt(FILE* stream, int64_t* integer)
{
	int negative, ndigits;
	int c = scan_unsigned(stream, 0, &negative, integer, &ndigits);
	if (negative)
		*integer = -(*integer);
	return (char)c;
}

// Reads the next real number from stream into *real; accepts an optional
// fractional part and an optional e/E exponent.
// Leading characters are skipped as in readInt(); *real is 0 when the input
// ends before a digit is found.
// Returns the character following the number (already consumed); this is
// (char)EOF at end of input.
char readReal(FILE* stream, float* real)
{
	double value;
	int found;
	int c = scan_real(stream, 0, &value, &found);
	*real = (float)value;
	return (char)c;
}

// Parse a line into integer+float (ID, raw power)
//   posTime, posID, posPower: 1-based column indices of the three fields.
//   time:     not written yet (timestamp conversion is still a TODO).
//   ID:       receives the ID column; 0 when that field holds no digit.
//   rawPower: receives the power column; FLT_MAX when the field holds no
//             number (empty, "NA", ...).
// On return the stream is positioned on the first character of the next
// non-empty line, or at end of file.
static void scan_line(FILE* ifile,
	int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	(void)time;
	int position = 1;
	for (;;) {
		int c;
		if (position == posTime) {
			//TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
			// until then the field is skipped like any other unused column
			c = fgetc(ifile);
		}
		else if (position == posID) {
			int negative, ndigits;
			int64_t ID_on64bits;
			c = scan_unsigned(ifile, 1, &negative, &ID_on64bits, &ndigits);
			*ID = (uint32_t)(negative ? -ID_on64bits : ID_on64bits);
		}
		else if (position == posPower) {
			float power = FLT_MAX; //"NA"
			double value;
			int found;
			// bounded scan: an empty or non-numeric field keeps the sentinel
			// instead of swallowing the next field or the next line
			c = scan_real(ifile, 1, &value, &found);
			if (found)
				power = (float)value;
			*rawPower = power;
		}
		else {
			//erase the comma (and skip field then)
			c = fgetc(ifile);
		}

		//continue until next comma (or line end or file end)
		while (c != EOF && c != '\n' && c != '\r' && c != ',')
			c = fgetc(ifile);
		position++;

		if (c == EOF || c == '\n' || c == '\r') {
			// skip all potential line feeds
			while (c == '\n' || c == '\r')
				c = fgetc(ifile);
			if (c != EOF)
				ungetc(c, ifile);
			break;
		}
	}
}

//TODO: check datetime at each line (build datetimes file ! for each year ?)
//also fill NA with closest value in file (easy)
//01JAN2009:00:00:00 ..........

// Main job: parse a data file into a conventional CSV file in rows, without header void transform(const char* ifileName, int posID, int posTime, int posValue, char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems { //TODO: complete timedate vector from first_time and last_time // --> this gives (expected) tsLength for free FILE* ifile = fopen(ifileName, "r"); // output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); // Skip header char nextChar; do nextChar = fgetc(ifile); while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') // process one client (ID in first column) at a time uint64_t processedLines = 0; //execution trace uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; uint32_t mismatchLengthCount=0; float tsBuffer[refTsLength]; lastID = 0; while (!feof(ifile)) { // next element to read always start with a digit do curChar = fgetc(ifile); while (!feof(ifile) && (curChar < '0' || curChar > '9')); if (feof(ifile)) break; ungetc(curChar, ifile); // read line scan_line(ifile, posID, &ID, posPower, &rawPower); if (ID != lastID) { //just starting a new time-series: must process the last one (if there is a last one !) 
if (lastID > 0) { if (tsLength == refTsLength) { for (int i=0; i 0 && ++seriesCount >= nbItems) break; } //if something wrong happened, skip series else { skippedSeriesCount++; if (tsLength != refTsLength) mismatchLengthCount++; } } // reinitialize flags tsLength = 0; lastID = ID; } //We cannot write more than refTsLength values if (tsLength < refTsLength) tsBuffer[tsLength++] = rawPower; if ((++processedLines) % 1000000 == 0) fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); } if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) { // flush last time-series if all conditions are met for (int i=0; i