prepare converter for DB extracts datasets
[epclust.git] / data / preprocessing / serialize.c
diff --git a/data/preprocessing/serialize.c b/data/preprocessing/serialize.c
deleted file mode 100644 (file)
index f35da64..0000000
+++ /dev/null
@@ -1,244 +0,0 @@
-#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
-#include <inttypes.h>
-#include <cgds/Vector.h>
-#include <string.h>
-#include <math.h>
-#include <float.h>
-
// Read a decimal integer from `stream` into `*integer`.
//
// The value is negative only when the very first character read is '-'.
// Every character that precedes the first digit is skipped, then the run of
// digits is accumulated.
//
// Returns the character that ended the digit run, converted to char
// ((char)EOF when the end of the stream is reached). If the stream ends before
// any digit is found, `*integer` is 0 and (char)EOF is returned.
char readInt(FILE* stream, int64_t* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it so that EOF stays distinct from any data byte
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);

	// Look for the first digit, giving up at end of input instead of spinning on EOF
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	if (curChar == EOF)
		return (char)EOF;

	int64_t magnitude = curChar - '0';
	while ((curChar = fgetc(stream)) >= '0' && curChar <= '9')
		magnitude = 10 * magnitude + (int64_t) (curChar - '0');
	*integer = sign * magnitude;
	return (char)curChar;
}
-
// Accumulate the run of decimal digits found at the current position of
// `stream` into `*value` (as value = 10*value + digit) and add the number of
// digits consumed to `*count`.
// Returns the first non-digit read (EOF at end of input).
static int readDigitRun(FILE* stream, double* value, long* count)
{
	int c;
	while ((c = fgetc(stream)) >= '0' && c <= '9')
	{
		*value = 10. * (*value) + (c - '0');
		(*count)++;
	}
	return c;
}

// Return value * 10^exponent, built from exact multiplications by ten.
static double scaleByPowerOfTen(double value, long exponent)
{
	if (value == 0.)
		return 0.;
	long steps = (exponent < 0 ? -exponent : exponent);
	// 10^400 already exceeds the range of a double: no need to iterate further
	if (steps > 400)
		steps = 400;
	double factor = 1.;
	while (steps-- > 0)
		factor *= 10.;
	return exponent < 0 ? value / factor : value * factor;
}

// Read a real number written as [-]digits[.digits][(e|E)[+|-]digits] from
// `stream` into `*real`.
//
// The value is negative only when the very first character read is '-'.
// Characters preceding the first digit are skipped, but the search stops at
// ',' , '\n', '\r' or end of input: in that case the field holds no number,
// `*real` is left untouched and the stopping character is returned.
//
// Returns the character that ended the number, converted to char
// ((char)EOF at end of input).
char readReal(FILE* stream, float* real)
{
	int c = fgetc(stream);
	// The sign must come from the text itself: the integer part of "-0.5" is 0
	int negative = (c == '-');

	while (c != EOF && c != ',' && c != '\n' && c != '\r' && (c < '0' || c > '9'))
		c = fgetc(stream);
	if (c < '0' || c > '9')
		return (char)c; //empty or non-numeric field
	ungetc(c, stream);

	// Integer and fractional digits are gathered into a single mantissa;
	// the count of fractional digits tells how far to shift it back.
	double mantissa = 0.;
	long integerDigits = 0, fractionDigits = 0;
	c = readDigitRun(stream, &mantissa, &integerDigits);
	if (c == '.')
		c = readDigitRun(stream, &mantissa, &fractionDigits);

	long exponent = 0;
	if (c == 'e' || c == 'E')
	{
		c = fgetc(stream);
		int negativeExponent = (c == '-');
		if (c == '-' || c == '+')
			c = fgetc(stream);
		while (c >= '0' && c <= '9')
		{
			// stop accumulating once the exponent is beyond any float anyway
			if (exponent < 1000)
				exponent = 10 * exponent + (c - '0');
			c = fgetc(stream);
		}
		if (negativeExponent)
			exponent = -exponent;
	}

	double magnitude = scaleByPowerOfTen(mantissa, exponent - fractionDigits);
	*real = (float) (negative ? -magnitude : magnitude);
	return (char)c;
}
-
// Parse one line of `ifile` into an integer and a float (ID, raw power).
//
// Fields are separated by commas and numbered from 1. The field at position
// `posID` is read with readInt() and stored (truncated to 32 bits) in `*ID`;
// the field at position `posPower` is read with readReal() and stored in
// `*rawPower`. Every other field is skipped.
//
// On return, the line break(s) ending the line have been consumed and the
// stream is positioned on the first character of the next non-empty line
// (or at end of file).
static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	char nextChar;
	for (int position = 1; ; position++)
	{
		if (position == posID)
		{
			int64_t ID_on64bits;
			nextChar = readInt(ifile, &ID_on64bits);
			*ID = (uint32_t)ID_on64bits;
		}
		else if (position == posPower)
		{
			// FLT_MAX stands for "NA": it is what the caller gets
			// whenever readReal() stores nothing
			float power = FLT_MAX;
			nextChar = readReal(ifile, &power);
			*rawPower = power;
		}
		else
			//read the first character of a field which is not needed
			nextChar = fgetc(ifile);

		//continue until next comma (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
			nextChar = fgetc(ifile);

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// skip all potential line feeds; the parentheses matter,
			// since && binds tighter than ||
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// give back the first character of the next line; ungetc()
			// expects an unsigned char value, not a possibly negative char
			if (!feof(ifile))
				ungetc((unsigned char)nextChar, ifile);
			break;
		}
	}
}
-
-// Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header
-void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
-{
-       // Use the header to know positions of ID and rawPower
-       FILE* ifile = fopen(ifileName, "r");
-       uint32_t headerShift = 0;
-       char curChar;
-       Vector* header = vector_new(char);
-       do
-       {
-               curChar = fgetc(ifile);
-               headerShift++;
-               if (curChar == '\n' || curChar == '\r')
-               {
-                       // Flush all potential other line feeds
-                       while (curChar == '\n' || curChar == '\r')
-                               curChar = fgetc(ifile);
-                       ungetc(curChar, ifile);
-                       break;
-               }
-               vector_push(header, curChar);
-       }
-       while (1);
-       char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char));
-       VectorIterator* it = vector_get_iterator(header);
-       int index = 0;
-       while (vectorI_has_data(it))
-       {
-               vectorI_get(it, headerString[index]);
-               vectorI_move_next(it);
-               index++;
-       }
-       vectorI_destroy(it);
-       headerString[index] = 0;
-       vector_destroy(header);
-       int position = 1, posID = 0, posPower = 0;
-       char* columnDescriptor = strtok(headerString, ",");
-       while (columnDescriptor != NULL)
-       {
-               if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id"))
-                       posID = position;
-               else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE"))
-                       posPower = position;
-               position++;
-               columnDescriptor = strtok(NULL, ",");
-       }
-       free(headerString);
-
-       // Estimate tsLength with a scan of the 3 first series
-       uint32_t ID=0, lastID=0, refTsLength=0;
-       float rawPower = 0.;
-       scan_line(ifile, posID, &ID, posPower, &rawPower);
-       //'sl' = sample lengths (short because a lot of comparisons then)
-       uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t));
-       for (int i=0; i<3; i++)
-       {
-               lastID = ID;
-               while (ID == lastID)
-               {
-                       sl[i]++;
-                       scan_line(ifile, posID, &ID, posPower, &rawPower);
-               }
-       }
-       if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2])
-               refTsLength = sl[2];
-       else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0])
-               refTsLength = sl[0];
-       else
-               refTsLength = sl[1];
-       free(sl);
-       //go back at the beginning of the first series (ready to read '\n'...)
-       fseek(ifile, headerShift-1, SEEK_SET);
-
-       // output file to write time-series sequentially, CSV format.
-       FILE* ofile = fopen(ofileName, "w");
-
-       // process one client (ID in first column) at a time
-       uint64_t processedLines = 0; //execution trace
-       uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
-       uint32_t mismatchLengthCount=0;
-       float tsBuffer[refTsLength];
-       lastID = 0;
-       while (!feof(ifile))
-       {
-               // next element to read always start with a digit
-               do
-                       curChar = fgetc(ifile);
-               while (!feof(ifile) && (curChar < '0' || curChar > '9'));
-               if (feof(ifile))
-                       break;
-               ungetc(curChar, ifile);
-
-               // read line
-               scan_line(ifile, posID, &ID, posPower, &rawPower);
-               if (ID != lastID)
-               {
-                       //just starting a new time-series: must process the last one (if there is a last one !)
-                       if (lastID > 0)
-                       {
-                               if (tsLength == refTsLength)
-                               {
-                                       for (int i=0; i<tsLength; i++)
-                                       {
-                                               char* format = i<tsLength-1 ? "%g," : "%g";
-                                               fprintf(ofile, format, tsBuffer[i]);
-                                       }
-                                       fprintf(ofile, "\n");
-                                       if (nbItems > 0 && ++seriesCount >= nbItems)
-                                               break;
-                               }
-                               //if something wrong happened, skip series
-                               else
-                               {
-                                       skippedSeriesCount++;
-                                       if (tsLength != refTsLength)
-                                               mismatchLengthCount++;
-                               }
-                       }
-
-                       // reinitialize flags
-                       tsLength = 0;
-                       lastID = ID;
-               }
-
-               //We cannot write more than refTsLength values
-               if (tsLength < refTsLength)
-                       tsBuffer[tsLength++] = rawPower;
-
-               if ((++processedLines) % 1000000 == 0)
-                       fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
-       }
-
-       if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems))
-       {
-               // flush last time-series if all conditions are met
-               for (int i=0; i<tsLength; i++)
-               {
-                       char* format = i<tsLength-1 ? "%g," : "%g";
-                       fprintf(ofile, format, tsBuffer[i]);
-               }
-               fprintf(ofile, "\n");
-               seriesCount++;
-       }
-       else if (nbItems <= 0 || seriesCount < nbItems)
-       {
-               if (tsLength != refTsLength)
-                       mismatchLengthCount++;
-       }
-
-       // finally print some statistics
-       if (seriesCount < nbItems)
-               fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount);
-       fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount);
-
-       fclose(ifile);
-       fclose(ofile);
-}