X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=code%2Fstage1%2Fsrc%2FTimeSeries%2Fserialize.c;h=3caa37127971151505e18533e67221cb3dc19c18;hb=ebf1280e432d51f47238ce8df86750ba3a7d6d1f;hp=88b15f1bd2c58de0295fbd6c602dd6009147315e;hpb=aa7daeaacfda268c392adf1c5efbccea77be9fe0;p=epclust.git diff --git a/code/stage1/src/TimeSeries/serialize.c b/code/stage1/src/TimeSeries/serialize.c index 88b15f1..3caa371 100644 --- a/code/stage1/src/TimeSeries/serialize.c +++ b/code/stage1/src/TimeSeries/serialize.c @@ -8,7 +8,7 @@ #include // parse a line into two integers (ID, raw power) -static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, uint32_t* rawPower) +static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower) { char nextChar; int position = 1; @@ -22,16 +22,14 @@ static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, uint32 } else if (position == posPower) { - Real untruncatedPower; - nextChar = readReal(ifile, &untruncatedPower); - if (untruncatedPower < 0.0) - untruncatedPower = 0.0; - *rawPower = (uint32_t) floor(untruncatedPower*10.0); + Real power; + nextChar = readReal(ifile, &power); + *rawPower = (float) power; } else //erase the comma (and skip field then) nextChar = fgetc(ifile); - + //continue until next comma (or line end or file end) while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',') nextChar = fgetc(ifile); @@ -99,7 +97,8 @@ void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbI free(headerString); //estimate tsLength with a scan of the 3 first series - uint32_t ID=0, rawPower=0, lastID=0, refTsLength=0; + uint32_t ID=0, lastID=0, refTsLength=0; + float rawPower = 0.0; scan_line(ifile, posID, &ID, posPower, &rawPower); //'sl' = sample lengths (short because a lot of comparisons then) uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t)); @@ -108,7 +107,7 @@ void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbI lastID = ID; while (ID == lastID) { - sl[i]++; + sl[i]++; scan_line(ifile, posID, &ID, posPower, &rawPower); } } @@ -124,9 +123,7 @@ void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbI // output file to write time-series sequentially, binary format. // Format: for each series, ID is stored on 4 bytes (unsigned integer32). Then, - // ()+ follow, with rawPower stored as a "3 bytes int" - // rawPower values are multiplied by 10 and truncated one digit after 0 - // NOTE: no raw power should be exactly zero + // ()+ follow, with rawPower stored as a float FILE* ofile = fopen(ofileName, "wb"); // leave space to write the number of series (32bits), and their length in bytes (32bits) @@ -136,9 +133,8 @@ void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbI // process one client (ID in first column) at a time uint64_t processedLines = 0; //execution trace uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; - uint32_t mismatchLengthCount=0, overflowCount=0; - Byte tsBuffer[4+3*refTsLength]; - int overflow = 0; + uint32_t mismatchLengthCount=0; + Byte tsBuffer[4+4*refTsLength]; lastID = 0; while (!feof(ifile)) { @@ -157,10 +153,10 @@ void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbI //just starting a new time-series: must process the last one (if there is a last one !) if (lastID > 0) { - if (tsLength == refTsLength && !overflow) + if (tsLength == refTsLength) { seriesCount++; - fwrite(tsBuffer, 4+3*tsLength, 1, ofile); + fwrite(tsBuffer, 4+4*tsLength, 1, ofile); if (nbItems > 0 && seriesCount >= nbItems) break; } @@ -170,62 +166,56 @@ void serialize_byCols(const char* ifileName, const char* ofileName, uint32_t nbI skippedSeriesCount++; if (tsLength != refTsLength) mismatchLengthCount++; - if (overflow) - overflowCount++; } } - + // ID for the new series is printed only once: - write_int(ID, 4, tsBuffer); + write_int(ID, tsBuffer); // reinitialize flags - overflow = 0; tsLength = 0; lastID = ID; } - overflow = (overflow || (rawPower >= (1 << 24))); //We cannot write more than refTsLength bytes if (tsLength < refTsLength) - write_int(rawPower, 3, tsBuffer + 4+3*tsLength); + write_real(rawPower, tsBuffer + 4+4*tsLength); tsLength++; - + if ((++processedLines) % 1000000 == 0) fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); } - if (!overflow && tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) + if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) { // flush last time-series if all conditions are met - fwrite(tsBuffer, 4+3*tsLength, 1, ofile); + fwrite(tsBuffer, 4+4*tsLength, 1, ofile); seriesCount++; } else if (nbItems <= 0 || seriesCount < nbItems) { if (tsLength != refTsLength) mismatchLengthCount++; - if (overflow) - overflowCount++; } // write lines count and size of a time-series in bytes Byte intBuffer[4]; fseek(ofile, 0, SEEK_SET); - write_int(seriesCount, 4, intBuffer); + write_int(seriesCount, intBuffer); fwrite(intBuffer, 1, 4, ofile); // re-express tsLength in bytes (not forgetting the ID)) - write_int(4 + 3 * refTsLength, 4, intBuffer); + write_int(4 + 4 * refTsLength, intBuffer); fwrite(intBuffer, 1, 4, ofile); // finally print some statistics if (seriesCount < nbItems) fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount); - fprintf(stdout,"%u overflows / %u mismatch series lengths.\n",overflowCount,mismatchLengthCount); - + fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount); + fclose(ifile); fclose(ofile); } -//serialize from usual 'by-row' data (for StarLight example and toy dataset) +//serialize from usual 'by-row' data void serialize_byRows(const char* ifileName, const char* ofileName, uint32_t nbItems) { FILE* ifile = fopen(ifileName, "r"); @@ -239,6 +229,7 @@ void serialize_byRows(const char* ifileName, const char* ofileName, uint32_t nbI { nbValues++; //skip potential consecutive commas (could be hard to spot) + //TODO(...): should be 'NA' in R, thus an error (we don't handle NAs)... while (curChar == ',') curChar = fgetc(ifile); ungetc(curChar, ifile); @@ -256,7 +247,7 @@ void serialize_byRows(const char* ifileName, const char* ofileName, uint32_t nbI fseek(ifile, 0, SEEK_SET); //write meta info first - uint32_t tsLength = 3*nbValues+4; + uint32_t tsLength = 4*nbValues+4; FILE* ofile = fopen(ofileName, "wb"); Byte intBuffer[4]; write_int(nbSeries, 4, intBuffer); @@ -265,10 +256,7 @@ void serialize_byRows(const char* ifileName, const char* ofileName, uint32_t nbI fwrite(intBuffer, 1, 4, ofile); Real rawPower; int64_t ID; - - //DEBUG / TEST (ugly, TOFIX...) - double minrp = INFINITY, maxrp = -INFINITY; - + for (uint32_t i=0; i maxrp) - maxrp = rawPower; - - write_int((uint32_t)floor(10.0*(rawPower+0.0)), 3, intBuffer); //x10... +3... - fwrite(intBuffer, 1, 3, ofile); + write_real(rawPower, intBuffer); + fwrite(intBuffer, 1, 4, ofile); while (curChar == ',') curChar = fgetc(ifile); ungetc(curChar, ifile); @@ -300,7 +281,4 @@ void serialize_byRows(const char* ifileName, const char* ofileName, uint32_t nbI } fclose(ifile); fclose(ofile); - - //DEBUG / TEST (ugly, TOFIX...) - printf("min / max values = %g %g\n",minrp,maxrp); }