prepare converter for DB extracts datasets
[epclust.git] / data / preprocessing / convert_to_CSV.c
similarity index 66%
rename from data/preprocessing/serialize.c
rename to data/preprocessing/convert_to_CSV.c
index f35da64..a29b7d8 100644 (file)
@@ -46,13 +46,18 @@ char readReal(FILE* stream, float* real)
 }
 
 // Parse a line into integer+float (ID, raw power)
-static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
+static void scan_line(FILE* ifile,
+       int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
 {
        char nextChar;
        int position = 1;
        while (1)
        {
-               if (position == posID)
+               if (position == posTime)
+               {
+                       //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
+               }
+               else if (position == posID)
                {
                        int64_t ID_on64bits;
                        nextChar = readInt(ifile, &ID_on64bits);
@@ -85,82 +90,29 @@ static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float*
        }
 }
 
-// Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header
-void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
+
+//TODO: check the datetime on each line (build a datetimes file, maybe one per year?)
+//TODO: also fill NA values with the closest value in the file (easy)
+//01JAN2009:00:00:00 ..........
+
+
+// Main job: parse a data file into a conventional CSV file in rows, without header
+void transform(const char* ifileName, int posID, int posTime, int posValue,
+       char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems
 {
-       // Use the header to know positions of ID and rawPower
-       FILE* ifile = fopen(ifileName, "r");
-       uint32_t headerShift = 0;
-       char curChar;
-       Vector* header = vector_new(char);
-       do
-       {
-               curChar = fgetc(ifile);
-               headerShift++;
-               if (curChar == '\n' || curChar == '\r')
-               {
-                       // Flush all potential other line feeds
-                       while (curChar == '\n' || curChar == '\r')
-                               curChar = fgetc(ifile);
-                       ungetc(curChar, ifile);
-                       break;
-               }
-               vector_push(header, curChar);
-       }
-       while (1);
-       char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char));
-       VectorIterator* it = vector_get_iterator(header);
-       int index = 0;
-       while (vectorI_has_data(it))
-       {
-               vectorI_get(it, headerString[index]);
-               vectorI_move_next(it);
-               index++;
-       }
-       vectorI_destroy(it);
-       headerString[index] = 0;
-       vector_destroy(header);
-       int position = 1, posID = 0, posPower = 0;
-       char* columnDescriptor = strtok(headerString, ",");
-       while (columnDescriptor != NULL)
-       {
-               if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id"))
-                       posID = position;
-               else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE"))
-                       posPower = position;
-               position++;
-               columnDescriptor = strtok(NULL, ",");
-       }
-       free(headerString);
-
-       // Estimate tsLength with a scan of the 3 first series
-       uint32_t ID=0, lastID=0, refTsLength=0;
-       float rawPower = 0.;
-       scan_line(ifile, posID, &ID, posPower, &rawPower);
-       //'sl' = sample lengths (short because a lot of comparisons then)
-       uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t));
-       for (int i=0; i<3; i++)
-       {
-               lastID = ID;
-               while (ID == lastID)
-               {
-                       sl[i]++;
-                       scan_line(ifile, posID, &ID, posPower, &rawPower);
-               }
-       }
-       if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2])
-               refTsLength = sl[2];
-       else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0])
-               refTsLength = sl[0];
-       else
-               refTsLength = sl[1];
-       free(sl);
-       //go back at the beginning of the first series (ready to read '\n'...)
-       fseek(ifile, headerShift-1, SEEK_SET);
+       //TODO: complete timedate vector from first_time and last_time
+       // --> this gives (expected) tsLength for free
 
+       FILE* ifile = fopen(ifileName, "r");
        // output file to write time-series sequentially, CSV format.
        FILE* ofile = fopen(ofileName, "w");
 
+       // Skip header
+       char nextChar;
+       do
+               nextChar = fgetc(ifile);
+       while (!feof(ifile) && nextChar != '\n' && nextChar != '\r')
+
        // process one client (ID in first column) at a time
        uint64_t processedLines = 0; //execution trace
        uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
@@ -242,3 +194,11 @@ void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
        fclose(ifile);
        fclose(ofile);
 }
+
+int main(int argc, char** argv) // args: ifile posID posTime posValue firstTime lastTime ofile nbItems
+{
+       if (argc < 9) return 1; // all 8 arguments are required before indexing argv; TODO: finer checks
+       transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
+               argv[5], argv[6], argv[7], atoi(argv[8]));
+       return 0;
+}