X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=data%2Fpreprocessing%2Fconvert_to_CSV.c;h=646c4407380ccf396971fc8d89bd96ace8537899;hb=a2fd2d76599672ac6396a0da1ae72007705044cb;hp=a29b7d87e343e39164cbeedb9f19db0f52319d96;hpb=86223e279a954d946ae641888f5107ed9feb6217;p=epclust.git diff --git a/data/preprocessing/convert_to_CSV.c b/data/preprocessing/convert_to_CSV.c index a29b7d8..646c440 100644 --- a/data/preprocessing/convert_to_CSV.c +++ b/data/preprocessing/convert_to_CSV.c @@ -4,201 +4,201 @@ #include #include #include +#include -char readInt(FILE* stream, int64_t* integer) +// Read an integer char by char, and position the cursor to next character +char readInt(FILE* stream, int* integer) { *integer = 0; char curChar = fgetc(stream); int sign = (curChar == '-' ? -1 : 1); while (curChar < '0' || curChar > '9') curChar = fgetc(stream); - ungetc(curChar, stream); - while ((curChar = fgetc(stream)) >= '0' && curChar <= '9') - *integer = 10 * (*integer) + (int64_t) (curChar - '0'); + while (curChar >= '0' && curChar <= '9') + { + *integer = 10 * (*integer) + (int) (curChar - '0'); + curChar = fgetc(stream); + } (*integer) *= sign; - return curChar; + return curChar; //separator, endline or .,e,E (if inside readReal) } +// Read a real number char by char, and position the cursor to next character char readReal(FILE* stream, float* real) { - int64_t integerPart; - char nextChar = readInt(stream, &integerPart); - int64_t fractionalPart = 0; - int countZeros = 0; - if (nextChar == '.') + int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0; + char curChar = readInt(stream, &integerPart); + if (curChar == '.') { //need to count zeros - while ((nextChar = fgetc(stream)) == '0') + while ((curChar = fgetc(stream)) == '0') countZeros++; - if (nextChar >= '1' && nextChar <= '9') + if (curChar >= '1' && curChar <= '9') { - ungetc(nextChar, stream); - nextChar = readInt(stream, &fractionalPart); + ungetc(curChar, stream); + curChar = readInt(stream, &fractionalPart); } } - int64_t exponent = 0; - if (nextChar == 'e' || nextChar == 'E') - nextChar = readInt(stream, &exponent); + if (curChar == 'e' || curChar == 'E') + curChar = readInt(stream, &exponent); *real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart / pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) ) * pow(10,exponent); - return nextChar; + return curChar; //separator or endline } -// Parse a line into integer+float (ID, raw power) -static void scan_line(FILE* ifile, - int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower) +// Parse a line into integer+float (ID, value) +static void scan_line(FILE* ifile, char sep, + int posID, int* ID, int posValue, float* value) { - char nextChar; + char curChar; int position = 1; while (1) { - if (position == posTime) - { - //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...) - } - else if (position == posID) - { - int64_t ID_on64bits; - nextChar = readInt(ifile, &ID_on64bits); - *ID = (uint32_t)ID_on64bits; - } - else if (position == posPower) - { - float power = FLT_MAX; //"NA" - nextChar = readReal(ifile, &power); //?? WARNING here... if empty field ?! - *rawPower = (float) power; - } + if (position == posID) + curChar = readInt(ifile, ID); + else if (position == posValue) + curChar = readReal(ifile, value); else - //erase the comma (and skip field then) - nextChar = fgetc(ifile); + curChar = fgetc(ifile); //erase the comma (and skip field then) - //continue until next comma (or line end or file end) - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',') - nextChar = fgetc(ifile); + // Continue until next separator (or line end or file end) + while (!feof(ifile) && curChar != '\n' && curChar != sep) + curChar = fgetc(ifile); position++; - if (feof(ifile) || nextChar == '\n' || nextChar == '\r') + if (curChar == '\n' || feof(ifile)) { - // skip all potential line feeds - while (!feof(ifile) && nextChar == '\n' || nextChar == '\r') - nextChar = fgetc(ifile); - if (!feof(ifile)) - ungetc(nextChar, ifile); + // Reached end of line break; } } } - -//TODO: check datetime at each line (build datetimes file ! for each year ?) -//also fill NA with closest value in file (easy) -//01JAN2009:00:00:00 .......... - - // Main job: parse a data file into a conventional CSV file in rows, without header -void transform(const char* ifileName, int posID, int posTime, int posValue, - char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems +// Current limitations: +// - remove partial series (we could fill missing values instead) +// - consider missing fields == 0 (if ,,) +// - IDs should be st. pos. integers +// - UNIX linebreaks only (\n) +int transform(const char* ifileName, int posID, int posValue, + const char* ofileName, int nbItems, char sep) { - //TODO: complete timedate vector from first_time and last_time - // --> this gives (expected) tsLength for free - + uint64_t processedLines = 0; //execution trace + uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; + int tsLength, lastID=0, ID, firstID, eof; + float value, tmpVal; + Vector* tsBuffer = vector_new(float); FILE* ifile = fopen(ifileName, "r"); - // output file to write time-series sequentially, CSV format. + // Output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); // Skip header - char nextChar; + char curChar; do - nextChar = fgetc(ifile); - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') + curChar = fgetc(ifile); + while (curChar != '\n'); - // process one client (ID in first column) at a time - uint64_t processedLines = 0; //execution trace - uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; - uint32_t mismatchLengthCount=0; - float tsBuffer[refTsLength]; - lastID = 0; - while (!feof(ifile)) + // Process one client (ID in first column) at a time + while (1) { - // next element to read always start with a digit - do + + eof = feof(ifile); + if (!eof) + { + // Is there anything left to read? (file may end with '\n') curChar = fgetc(ifile); - while (!feof(ifile) && (curChar < '0' || curChar > '9')); - if (feof(ifile)) - break; - ungetc(curChar, ifile); + if (!feof(ifile) && curChar != '\n') + { + // Yes: read current line + ungetc(curChar, ifile); + scan_line(ifile, sep, posID, &ID, posValue, &value); + } + else + eof = 1; + } - // read line - scan_line(ifile, posID, &ID, posPower, &rawPower); - if (ID != lastID) + if (ID != lastID || eof) { - //just starting a new time-series: must process the last one (if there is a last one !) if (lastID > 0) { - if (tsLength == refTsLength) + // Just starting a new time-series (or EOF): process the last one + if (tsLength == vector_size(tsBuffer)) { - for (int i=0; i 0 && ++seriesCount >= nbItems) break; } - //if something wrong happened, skip series else { - skippedSeriesCount++; - if (tsLength != refTsLength) - mismatchLengthCount++; + // Mismatch lengths: skip series + mismatchLengthCount++; } } - - // reinitialize flags + else + firstID = ID; + if (eof) + { + // Last serie is processed + break; + } + // Reinitialize current index of new serie tsLength = 0; lastID = ID; } - //We cannot write more than refTsLength values - if (tsLength < refTsLength) - tsBuffer[tsLength++] = rawPower; + // Fill values buffer + if (ID != firstID) + { + if (tsLength < vector_size(tsBuffer)) + vector_set(tsBuffer, tsLength, value); + } + else + { + // First serie is reference: push all values + vector_push(tsBuffer, value); + } + tsLength++; if ((++processedLines) % 1000000 == 0) fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); } - if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) - { - // flush last time-series if all conditions are met - for (int i=0; i 0) + fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount); fclose(ifile); fclose(ofile); + return 0; } -int main(char** argv, int argc) +int main(int argc, char** argv) { - //TODO: args checks... - transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]), - argv[5], argv[6], argv[7], atoi(argv[8])); - return 0; + if (argc < 4) //program name + 3 arguments + { + printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \ + - ifileName: name of by-columns CSV input file\n \ + - posID: position of the identifier in a line (start at 1)\n \ + - posValue: position of the value of interest in a line\n \ + - ofileName: name of the output file; default: out.csv\n \ + - nbItems: number of series to retrieve; default: 0 (all)\n \ + - sep: fields separator; default: ','\n"); + return 0; + } + else + { + return transform(argv[1], atoi(argv[2]), atoi(argv[3]), + argc > 4 ? argv[4] : "out.csv", + argc > 5 ? atoi(argv[5]) : 0, + argc > 6 ? argv[6][0] : ','); + } }