X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=data%2Fpreprocessing%2Fconvert_to_CSV.c;h=34cb6e453adc10205e89a87099600b5f13f40984;hb=a0fa5bd0324ecd9bf92e9940e98344f7ee4b2509;hp=a29b7d87e343e39164cbeedb9f19db0f52319d96;hpb=86223e279a954d946ae641888f5107ed9feb6217;p=epclust.git diff --git a/data/preprocessing/convert_to_CSV.c b/data/preprocessing/convert_to_CSV.c index a29b7d8..34cb6e4 100644 --- a/data/preprocessing/convert_to_CSV.c +++ b/data/preprocessing/convert_to_CSV.c @@ -5,25 +5,27 @@ #include #include -char readInt(FILE* stream, int64_t* integer) +// Read an integer char by char, and position the cursor to next character +char readInt(FILE* stream, int* integer) { *integer = 0; - char curChar = fgetc(stream); - int sign = (curChar == '-' ? -1 : 1); - while (curChar < '0' || curChar > '9') - curChar = fgetc(stream); - ungetc(curChar, stream); - while ((curChar = fgetc(stream)) >= '0' && curChar <= '9') - *integer = 10 * (*integer) + (int64_t) (curChar - '0'); + char nextChar = fgetc(stream); + int sign = (nextChar == '-' ? -1 : 1); + while (nextChar < '0' || nextChar > '9') + nextChar = fgetc(stream); + ungetc(nextChar, stream); + while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9') + *integer = 10 * (*integer) + (int) (nextChar - '0'); (*integer) *= sign; - return curChar; + return nextChar; } +// Read a real number char by char, and position the cursor to next character char readReal(FILE* stream, float* real) { - int64_t integerPart; + int integerPart; char nextChar = readInt(stream, &integerPart); - int64_t fractionalPart = 0; + int fractionalPart = 0; int countZeros = 0; if (nextChar == '.') { @@ -36,7 +38,7 @@ char readReal(FILE* stream, float* real) nextChar = readInt(stream, &fractionalPart); } } - int64_t exponent = 0; + int exponent = 0; if (nextChar == 'e' || nextChar == 'E') nextChar = readInt(stream, &exponent); *real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart @@ -45,42 +47,29 @@ char readReal(FILE* stream, float* real) return nextChar; } -// Parse a line into integer+float (ID, raw power) -static void scan_line(FILE* ifile, - int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower) +// Parse a line into integer+float (ID, value) +static void scan_line(FILE* ifile, char sep, + int posID, int* ID, int posValue, float* value) { char nextChar; int position = 1; while (1) { - if (position == posTime) - { - //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...) - } - else if (position == posID) - { - int64_t ID_on64bits; - nextChar = readInt(ifile, &ID_on64bits); - *ID = (uint32_t)ID_on64bits; - } - else if (position == posPower) - { - float power = FLT_MAX; //"NA" - nextChar = readReal(ifile, &power); //?? WARNING here... if empty field ?! - *rawPower = (float) power; - } + if (position == posID) + nextChar = readInt(ifile, ID); + else if (position == posValue) + nextChar = readReal(ifile, value); else - //erase the comma (and skip field then) - nextChar = fgetc(ifile); + nextChar = fgetc(ifile); //erase the comma (and skip field then) - //continue until next comma (or line end or file end) - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',') + // Continue until next separator (or line end or file end) + while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep) nextChar = fgetc(ifile); position++; if (feof(ifile) || nextChar == '\n' || nextChar == '\r') { - // skip all potential line feeds + // Skip all potential line feeds while (!feof(ifile) && nextChar == '\n' || nextChar == '\r') nextChar = fgetc(ifile); if (!feof(ifile)) @@ -90,70 +79,61 @@ static void scan_line(FILE* ifile, } } - -//TODO: check datetime at each line (build datetimes file ! for each year ?) -//also fill NA with closest value in file (easy) -//01JAN2009:00:00:00 .......... - - // Main job: parse a data file into a conventional CSV file in rows, without header -void transform(const char* ifileName, int posID, int posTime, int posValue, - char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems +// Current limitations: +// - remove partial series (we could fill missing values instead) +// - consider missing fields == 0 +// - IDs should be integers +int transform(const char* ifileName, int posID, int posValue, + const char* ofileName, int nbItems, char sep) { - //TODO: complete timedate vector from first_time and last_time - // --> this gives (expected) tsLength for free - FILE* ifile = fopen(ifileName, "r"); - // output file to write time-series sequentially, CSV format. + // Output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); // Skip header - char nextChar; + char curChar; do - nextChar = fgetc(ifile); - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') + curChar = fgetc(ifile); + while (!feof(ifile) && curChar != '\n' && curChar != '\r'); - // process one client (ID in first column) at a time + // Process one client (ID in first column) at a time uint64_t processedLines = 0; //execution trace - uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; - uint32_t mismatchLengthCount=0; - float tsBuffer[refTsLength]; - lastID = 0; + uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; + int tsLength=0, refTsLength=0, lastID=0, ID=0; + float value=0., tsBuffer[refTsLength]; while (!feof(ifile)) { - // next element to read always start with a digit - do + // Go to next line + while (!feof(ifile) && (curChar == '\n' || curChar == '\r')) curChar = fgetc(ifile); - while (!feof(ifile) && (curChar < '0' || curChar > '9')); if (feof(ifile)) break; ungetc(curChar, ifile); - // read line - scan_line(ifile, posID, &ID, posPower, &rawPower); + // Read current line + scan_line(ifile, sep, posID, &ID, posValue, &value); if (ID != lastID) { - //just starting a new time-series: must process the last one (if there is a last one !) + // Just starting a new time-series: must process the last one (if exists !) if (lastID > 0) { + if (refTsLength == 0) + refTsLength = tsLength; //first serie is considered clean if (tsLength == refTsLength) { for (int i=0; i 0 && ++seriesCount >= nbItems) break; } - //if something wrong happened, skip series + // Mismatch lengths: skip series else - { - skippedSeriesCount++; - if (tsLength != refTsLength) - mismatchLengthCount++; - } + mismatchLengthCount++; } // reinitialize flags @@ -163,42 +143,57 @@ void transform(const char* ifileName, int posID, int posTime, int posValue, //We cannot write more than refTsLength values if (tsLength < refTsLength) - tsBuffer[tsLength++] = rawPower; + tsBuffer[tsLength++] = value; if ((++processedLines) % 1000000 == 0) fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); } - if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) + if (nbItems <= 0 || seriesCount < nbItems) { // flush last time-series if all conditions are met - for (int i=0; i 0) + fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount); fclose(ifile); fclose(ofile); + return 0; } -int main(char** argv, int argc) +int main(int argc, char** argv) { - //TODO: args checks... - transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]), - argv[5], argv[6], argv[7], atoi(argv[8])); - return 0; + if (argc < 4) //program name + 3 arguments + { + printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \ + - ifileName: name of by-columns CSV input file\n \ + - posID: position of the identifier in a line (start at 1)\n \ + - posValue: position of the value of interest in a line\n \ + - ofileName: name of the output file; default: out.csv\n \ + - nbItems: number of series to retrieve; default: 0 (all)\n \ + - sep: fields separator; default: ','\n"); + return 0; + } + else + { + return transform(argv[1], atoi(argv[2]), atoi(argv[3]), + argc > 4 ? argv[4] : "out.csv", + argc > 5 ? atoi(argv[5]) : 0, + argc > 6 ? argv[6][0] : ','); + } }