From: Benjamin Auder Date: Tue, 14 Mar 2017 14:59:40 +0000 (+0100) Subject: work on CSV preprocessing X-Git-Url: https://git.auder.net/?p=epclust.git;a=commitdiff_plain;h=a0fa5bd0324ecd9bf92e9940e98344f7ee4b2509 work on CSV preprocessing --- diff --git a/.gitignore b/.gitignore index 255781c..96947f0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,6 @@ /data/* !/data/README !/data/preprocessing/ -/data/preprocessing/* -!/data/preprocessing/convert.c -!/data/preprocessing/Makefile #files generated by initialize.sh /.gitfat @@ -15,7 +12,7 @@ *.swp #ignore binary files generated by claws() -*.bin +*.epclust.bin #ignore R session files .Rhistory @@ -33,9 +30,10 @@ #ignore jupyter generated file (HTML vignette, and reports) *.ipynb.html -#ignore object files +#ignore object files and executables *.o *.so +*.exe #ignore RcppExports, generated by Rcpp::compileAttributes /epclust/R/RcppExports.R diff --git a/data/preprocessing/Makefile b/data/preprocessing/Makefile index 9a4a044..097420e 100644 --- a/data/preprocessing/Makefile +++ b/data/preprocessing/Makefile @@ -1,2 +1,4 @@ -#TODO: depend on cgds... -gcc -o convert_to_CSV convert_to_CSV.c -lm +TARGET = transform.exe + +$(TARGET): convert_to_CSV.c + gcc convert_to_CSV.c -lm -lcgds -o $(TARGET) diff --git a/data/preprocessing/README b/data/preprocessing/README new file mode 100644 index 0000000..1a860df --- /dev/null +++ b/data/preprocessing/README @@ -0,0 +1,4 @@ +Converter from raw by-columns CSV format to by-rows CSV file +(much smaller), as epclust::claws() expect as CSV input. + +Dependency: cgds, https://git.auder.net/?p=cgds.git diff --git a/data/preprocessing/convert_to_CSV.c b/data/preprocessing/convert_to_CSV.c index a29b7d8..34cb6e4 100644 --- a/data/preprocessing/convert_to_CSV.c +++ b/data/preprocessing/convert_to_CSV.c @@ -5,25 +5,27 @@ #include #include -char readInt(FILE* stream, int64_t* integer) +// Read an integer char by char, and position the cursor to next character +char readInt(FILE* stream, int* integer) { *integer = 0; - char curChar = fgetc(stream); - int sign = (curChar == '-' ? -1 : 1); - while (curChar < '0' || curChar > '9') - curChar = fgetc(stream); - ungetc(curChar, stream); - while ((curChar = fgetc(stream)) >= '0' && curChar <= '9') - *integer = 10 * (*integer) + (int64_t) (curChar - '0'); + char nextChar = fgetc(stream); + int sign = (nextChar == '-' ? -1 : 1); + while (nextChar < '0' || nextChar > '9') + nextChar = fgetc(stream); + ungetc(nextChar, stream); + while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9') + *integer = 10 * (*integer) + (int) (nextChar - '0'); (*integer) *= sign; - return curChar; + return nextChar; } +// Read a real number char by char, and position the cursor to next character char readReal(FILE* stream, float* real) { - int64_t integerPart; + int integerPart; char nextChar = readInt(stream, &integerPart); - int64_t fractionalPart = 0; + int fractionalPart = 0; int countZeros = 0; if (nextChar == '.') { @@ -36,7 +38,7 @@ char readReal(FILE* stream, float* real) nextChar = readInt(stream, &fractionalPart); } } - int64_t exponent = 0; + int exponent = 0; if (nextChar == 'e' || nextChar == 'E') nextChar = readInt(stream, &exponent); *real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart @@ -45,42 +47,29 @@ char readReal(FILE* stream, float* real) return nextChar; } -// Parse a line into integer+float (ID, raw power) -static void scan_line(FILE* ifile, - int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower) +// Parse a line into integer+float (ID, value) +static void scan_line(FILE* ifile, char sep, + int posID, int* ID, int posValue, float* value) { char nextChar; int position = 1; while (1) { - if (position == posTime) - { - //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...) - } - else if (position == posID) - { - int64_t ID_on64bits; - nextChar = readInt(ifile, &ID_on64bits); - *ID = (uint32_t)ID_on64bits; - } - else if (position == posPower) - { - float power = FLT_MAX; //"NA" - nextChar = readReal(ifile, &power); //?? WARNING here... if empty field ?! - *rawPower = (float) power; - } + if (position == posID) + nextChar = readInt(ifile, ID); + else if (position == posValue) + nextChar = readReal(ifile, value); else - //erase the comma (and skip field then) - nextChar = fgetc(ifile); + nextChar = fgetc(ifile); //erase the comma (and skip field then) - //continue until next comma (or line end or file end) - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',') + // Continue until next separator (or line end or file end) + while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep) nextChar = fgetc(ifile); position++; if (feof(ifile) || nextChar == '\n' || nextChar == '\r') { - // skip all potential line feeds + // Skip all potential line feeds while (!feof(ifile) && nextChar == '\n' || nextChar == '\r') nextChar = fgetc(ifile); if (!feof(ifile)) @@ -90,70 +79,61 @@ static void scan_line(FILE* ifile, } } - -//TODO: check datetime at each line (build datetimes file ! for each year ?) -//also fill NA with closest value in file (easy) -//01JAN2009:00:00:00 .......... - - // Main job: parse a data file into a conventional CSV file in rows, without header -void transform(const char* ifileName, int posID, int posTime, int posValue, - char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems +// Current limitations: +// - remove partial series (we could fill missing values instead) +// - consider missing fields == 0 +// - IDs should be integers +int transform(const char* ifileName, int posID, int posValue, + const char* ofileName, int nbItems, char sep) { - //TODO: complete timedate vector from first_time and last_time - // --> this gives (expected) tsLength for free - FILE* ifile = fopen(ifileName, "r"); - // output file to write time-series sequentially, CSV format. + // Output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); // Skip header - char nextChar; + char curChar; do - nextChar = fgetc(ifile); - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') + curChar = fgetc(ifile); + while (!feof(ifile) && curChar != '\n' && curChar != '\r'); - // process one client (ID in first column) at a time + // Process one client (ID in first column) at a time uint64_t processedLines = 0; //execution trace - uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; - uint32_t mismatchLengthCount=0; - float tsBuffer[refTsLength]; - lastID = 0; + uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; + int tsLength=0, refTsLength=0, lastID=0, ID=0; + float value=0., tsBuffer[refTsLength]; while (!feof(ifile)) { - // next element to read always start with a digit - do + // Go to next line + while (!feof(ifile) && (curChar == '\n' || curChar == '\r')) curChar = fgetc(ifile); - while (!feof(ifile) && (curChar < '0' || curChar > '9')); if (feof(ifile)) break; ungetc(curChar, ifile); - // read line - scan_line(ifile, posID, &ID, posPower, &rawPower); + // Read current line + scan_line(ifile, sep, posID, &ID, posValue, &value); if (ID != lastID) { - //just starting a new time-series: must process the last one (if there is a last one !) + // Just starting a new time-series: must process the last one (if exists !) if (lastID > 0) { + if (refTsLength == 0) + refTsLength = tsLength; //first serie is considered clean if (tsLength == refTsLength) { for (int i=0; i 0 && ++seriesCount >= nbItems) break; } - //if something wrong happened, skip series + // Mismatch lengths: skip series else - { - skippedSeriesCount++; - if (tsLength != refTsLength) - mismatchLengthCount++; - } + mismatchLengthCount++; } // reinitialize flags @@ -163,42 +143,57 @@ void transform(const char* ifileName, int posID, int posTime, int posValue, //We cannot write more than refTsLength values if (tsLength < refTsLength) - tsBuffer[tsLength++] = rawPower; + tsBuffer[tsLength++] = value; if ((++processedLines) % 1000000 == 0) fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); } - if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) + if (nbItems <= 0 || seriesCount < nbItems) { // flush last time-series if all conditions are met - for (int i=0; i 0) + fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount); fclose(ifile); fclose(ofile); + return 0; } -int main(char** argv, int argc) +int main(int argc, char** argv) { - //TODO: args checks... - transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]), - argv[5], argv[6], argv[7], atoi(argv[8])); - return 0; + if (argc < 4) //program name + 3 arguments + { + printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \ + - ifileName: name of by-columns CSV input file\n \ + - posID: position of the identifier in a line (start at 1)\n \ + - posValue: position of the value of interest in a line\n \ + - ofileName: name of the output file; default: out.csv\n \ + - nbItems: number of series to retrieve; default: 0 (all)\n \ + - sep: fields separator; default: ','\n"); + return 0; + } + else + { + return transform(argv[1], atoi(argv[2]), atoi(argv[3]), + argc > 4 ? argv[4] : "out.csv", + argc > 5 ? atoi(argv[5]) : 0, + argc > 6 ? argv[6][0] : ','); + } } diff --git a/data/preprocessing/input_test.csv b/data/preprocessing/input_test.csv new file mode 100644 index 0000000..e69de29 diff --git a/data/preprocessing/test_convert.c b/data/preprocessing/test_convert.c new file mode 100644 index 0000000..20a5807 --- /dev/null +++ b/data/preprocessing/test_convert.c @@ -0,0 +1,5 @@ +int main(int argc, char** argv) +{ + execl("transform", "input_test.csv", 1, 3, "output_test.csv", ","); + //TODO: diff, .... http://stackoverflow.com/questions/7292642/grabbing-output-from-exec +}