From a2fd2d76599672ac6396a0da1ae72007705044cb Mon Sep 17 00:00:00 2001 From: Benjamin Auder <benjamin.auder@somewhere> Date: Sat, 18 Mar 2017 08:13:55 +0100 Subject: [PATCH] fix convert_to_CSV.c --- data/preprocessing/convert_to_CSV.c | 167 ++++++++++---------- data/preprocessing/expected_output_test.csv | 6 +- 2 files changed, 85 insertions(+), 88 deletions(-) diff --git a/data/preprocessing/convert_to_CSV.c b/data/preprocessing/convert_to_CSV.c index df31f71..646c440 100644 --- a/data/preprocessing/convert_to_CSV.c +++ b/data/preprocessing/convert_to_CSV.c @@ -10,76 +10,66 @@ char readInt(FILE* stream, int* integer) { *integer = 0; - char nextChar = fgetc(stream); - int sign = (nextChar == '-' ? -1 : 1); - while (nextChar < '0' || nextChar > '9') - nextChar = fgetc(stream); - ungetc(nextChar, stream); - while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9') + char curChar = fgetc(stream); + int sign = (curChar == '-' ? -1 : 1); + while (curChar < '0' || curChar > '9') + curChar = fgetc(stream); + while (curChar >= '0' && curChar <= '9') { - - printf("next char: %c\n",nextChar); - - *integer = 10 * (*integer) + (int) (nextChar - '0'); } + *integer = 10 * (*integer) + (int) (curChar - '0'); + curChar = fgetc(stream); + } (*integer) *= sign; -printf("INTEGER: %i\n",*integer); - return nextChar; + return curChar; //separator, endline or .,e,E (if inside readReal) } // Read a real number char by char, and position the cursor to next character char readReal(FILE* stream, float* real) { - int integerPart; - char nextChar = readInt(stream, &integerPart); - int fractionalPart = 0; - int countZeros = 0; - if (nextChar == '.') + int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0; + char curChar = readInt(stream, &integerPart); + if (curChar == '.') { //need to count zeros - while ((nextChar = fgetc(stream)) == '0') + while ((curChar = fgetc(stream)) == '0') countZeros++; - if (nextChar >= '1' && nextChar <= '9') + if (curChar >= '1' && curChar <= '9') { - ungetc(nextChar, stream); - nextChar = readInt(stream, &fractionalPart); + ungetc(curChar, stream); + curChar = readInt(stream, &fractionalPart); } } - int exponent = 0; - if (nextChar == 'e' || nextChar == 'E') - nextChar = readInt(stream, &exponent); + if (curChar == 'e' || curChar == 'E') + curChar = readInt(stream, &exponent); *real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart / pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) ) * pow(10,exponent); - return nextChar; + return curChar; //separator or endline } // Parse a line into integer+float (ID, value) static void scan_line(FILE* ifile, char sep, int posID, int* ID, int posValue, float* value) { - char nextChar; + char curChar; int position = 1; while (1) { if (position == posID) - nextChar = readInt(ifile, ID); + curChar = readInt(ifile, ID); else if (position == posValue) - nextChar = readReal(ifile, value); + curChar = readReal(ifile, value); else - nextChar = fgetc(ifile); //erase the comma (and skip field then) + curChar = fgetc(ifile); //erase the comma (and skip field then) // Continue until next separator (or line end or file end) - while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep) - nextChar = fgetc(ifile); + while (!feof(ifile) && curChar != '\n' && curChar != sep) + curChar = fgetc(ifile); position++; - if (feof(ifile) || nextChar == '\n' || nextChar == '\r') + if (curChar == '\n' || feof(ifile)) { - // Skip all potential line feeds - while (!feof(ifile) && nextChar == '\n' || nextChar == '\r') - nextChar = fgetc(ifile); - if (!feof(ifile)) - ungetc(nextChar, ifile); + // Reached end of line break; } } @@ -88,11 +78,17 @@ static void scan_line(FILE* ifile, char sep, // Main job: parse a data file into a conventional CSV file in rows, without header // Current limitations: // - remove partial series (we could fill missing values instead) -// - consider missing fields == 0 -// - IDs should be integers +// - consider missing fields == 0 (if ,,) +// - IDs should be st. pos. integers +// - UNIX linebreaks only (\n) int transform(const char* ifileName, int posID, int posValue, const char* ofileName, int nbItems, char sep) { + uint64_t processedLines = 0; //execution trace + uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; + int tsLength, lastID=0, ID, firstID, eof; + float value, tmpVal; + Vector* tsBuffer = vector_new(float); FILE* ifile = fopen(ifileName, "r"); // Output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); @@ -101,77 +97,78 @@ int transform(const char* ifileName, int posID, int posValue, char curChar; do curChar = fgetc(ifile); - while (!feof(ifile) && curChar != '\n' && curChar != '\r'); + while (curChar != '\n'); // Process one client (ID in first column) at a time - uint64_t processedLines = 0; //execution trace - uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; - int tsLength=0, refTsLength=0, lastID=0, ID=0; - float value=0., tsBuffer[refTsLength]; - while (!feof(ifile)) + while (1) { - // Go to next line - while (!feof(ifile) && (curChar == '\n' || curChar == '\r')) - curChar = fgetc(ifile); - if (feof(ifile)) - break; - ungetc(curChar, ifile); - // Read current line - scan_line(ifile, sep, posID, &ID, posValue, &value); + eof = feof(ifile); + if (!eof) + { + // Is there anything left to read? (file may end with '\n') + curChar = fgetc(ifile); + if (!feof(ifile) && curChar != '\n') + { + // Yes: read current line + ungetc(curChar, ifile); + scan_line(ifile, sep, posID, &ID, posValue, &value); + } + else + eof = 1; + } - printf("SCAN: %i %g\n",ID,value); - if (ID != lastID) + if (ID != lastID || eof) { - // Just starting a new time-series: must process the last one (if exists !) if (lastID > 0) { - if (tsLength == refTsLength) + // Just starting a new time-series (or EOF): process the last one + if (tsLength == vector_size(tsBuffer)) { - for (int i=0; i<tsLength; i++) + for (int i=0; i<tsLength-1; i++) { - char* format = i<tsLength-1 ? "%g%c" : "%g"; - fprintf(ofile, format, tsBuffer[i], sep); + vector_get(tsBuffer, i, tmpVal); + fprintf(ofile, "%g%c", tmpVal, sep); } - fprintf(ofile, "\n"); + vector_get(tsBuffer, tsLength-1, tmpVal); + fprintf(ofile, "%g\n", tmpVal); + seriesCount++; if (nbItems > 0 && ++seriesCount >= nbItems) break; } - // Mismatch lengths: skip series else + { + // Mismatch lengths: skip series mismatchLengthCount++; + } } else - refTsLength = tsLength; //first serie is considered clean - - // reinitialize flags + firstID = ID; + if (eof) + { + // Last serie is processed + break; + } + // Reinitialize current index of new serie tsLength = 0; lastID = ID; } -printf("LA %i %i\n",tsLength,refTsLength); - //We cannot write more than refTsLength values - if (tsLength < refTsLength) - tsBuffer[tsLength++] = value; - - if ((++processedLines) % 1000000 == 0) - fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); - } - if (nbItems <= 0 || seriesCount < nbItems) - { - // flush last time-series if all conditions are met - if (tsLength == refTsLength) + // Fill values buffer + if (ID != firstID) { - for (int i=0; i<tsLength; i++) - { - char* format = i<tsLength-1 ? "%g%c" : "%g"; - fprintf(ofile, format, tsBuffer[i], sep); - } - fprintf(ofile, "\n"); - seriesCount++; + if (tsLength < vector_size(tsBuffer)) + vector_set(tsBuffer, tsLength, value); } else - mismatchLengthCount++; + { + // First serie is reference: push all values + vector_push(tsBuffer, value); + } + tsLength++; + + if ((++processedLines) % 1000000 == 0) + fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); } // finally print some statistics diff --git a/data/preprocessing/expected_output_test.csv b/data/preprocessing/expected_output_test.csv index 4cd40d7..f837a25 100644 --- a/data/preprocessing/expected_output_test.csv +++ b/data/preprocessing/expected_output_test.csv @@ -1,3 +1,3 @@ -1.05,2.,3. -1e4,1.,0. -3.25e2,-2.0e3,15.55 +1.05,2,3 +10000,1,0 +325,-2000,15.55 -- 2.44.0