prepare converter for DB extracts datasets
authorBenjamin Auder <benjamin.auder@somewhere>
Mon, 6 Mar 2017 11:50:02 +0000 (12:50 +0100)
committerBenjamin Auder <benjamin.auder@somewhere>
Mon, 6 Mar 2017 11:50:02 +0000 (12:50 +0100)
.gitignore
data/preprocessing/Makefile [new file with mode: 0644]
data/preprocessing/convert_32kEDF.R [deleted file]
data/preprocessing/convert_to_CSV.c [moved from data/preprocessing/serialize.c with 66% similarity]

index dbcc2f0..8db5c77 100644 (file)
@@ -2,6 +2,9 @@
 /data/*
 !/data/README
 !/data/preprocessing/
+/data/preprocessing/*
+!/data/preprocessing/convert_to_CSV.c
+!/data/preprocessing/Makefile
 
 #files generated by initialize.sh
 /.gitfat
diff --git a/data/preprocessing/Makefile b/data/preprocessing/Makefile
new file mode 100644 (file)
index 0000000..9a4a044
--- /dev/null
@@ -0,0 +1,2 @@
+#TODO: depend on cgds... (one-line rule: "target: prerequisite ; recipe")
+convert_to_CSV: convert_to_CSV.c ; gcc -o convert_to_CSV convert_to_CSV.c -lm
diff --git a/data/preprocessing/convert_32kEDF.R b/data/preprocessing/convert_32kEDF.R
deleted file mode 100644 (file)
index 2e6798a..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-convert_32kEDF = function(orig_csv, nb_series_per_chunk)
-{
-       datetimes = #...TODO: all 3 years? year-by-year is better
-       orig_con = file(orig_csv, open="r") #2009, 2010 or 2011
-       ignored = readLines(orig_con, 1) #skip header
-       serie_length = length(datetimes) #around 365*24*2 = 17520
-       sep = if (year==2009) "," else if (year==2010) ";" else ";"
-
-scan(orig_con, character(), sep=",", nlines=1, quiet=TRUE)
-       library(sqldf, quietly=TRUE)
-       ids = read.csv.sql(file_csv, header = TRUE, sep = ","
-               sql = "select * from file_csv group by FK_CCU_ID")
-       index = 0
-       repeat
-       {
-               if (index+1 >= length(ids))
-                       break
-               request = "select CPP_DATE_PUISSANCE,CPP_PUISSANCE_BRUTE where FK_CCU_ID in ("
-               for (id in ids[index + seq_len(nb_series_per_chunk)])
-                       request = paste(request, id, ",", sep="")
-               request = paste(request, ") order by FK_CCU_ID,CPP_DATE_PUISSANCE", sep="")
-               series_chunk = read.csv.sql(file_csv, header = TRUE, sep = ",", sql = request)
-               
-               index = index + 17520
similarity index 66%
rename from data/preprocessing/serialize.c
rename to data/preprocessing/convert_to_CSV.c
index f35da64..a29b7d8 100644 (file)
@@ -46,13 +46,18 @@ char readReal(FILE* stream, float* real)
 }
 
 // Parse a line into integer+float (ID, raw power)
-static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
+static void scan_line(FILE* ifile,
+       int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
 {
        char nextChar;
        int position = 1;
        while (1)
        {
-               if (position == posID)
+               if (position == posTime)
+               {
+                       //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
+               }
+               else if (position == posID)
                {
                        int64_t ID_on64bits;
                        nextChar = readInt(ifile, &ID_on64bits);
@@ -85,82 +90,29 @@ static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float*
        }
 }
 
-// Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header
-void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
+
+//TODO: check datetime at each line (build datetimes file ! for each year ?)
+//also fill NA with closest value in file (easy)
+//01JAN2009:00:00:00 ..........
+
+
+// Main job: parse a data file into a conventional CSV file in rows, without header
+void transform(const char* ifileName, int posID, int posTime, int posValue,
+       char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems
 {
-       // Use the header to know positions of ID and rawPower
-       FILE* ifile = fopen(ifileName, "r");
-       uint32_t headerShift = 0;
-       char curChar;
-       Vector* header = vector_new(char);
-       do
-       {
-               curChar = fgetc(ifile);
-               headerShift++;
-               if (curChar == '\n' || curChar == '\r')
-               {
-                       // Flush all potential other line feeds
-                       while (curChar == '\n' || curChar == '\r')
-                               curChar = fgetc(ifile);
-                       ungetc(curChar, ifile);
-                       break;
-               }
-               vector_push(header, curChar);
-       }
-       while (1);
-       char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char));
-       VectorIterator* it = vector_get_iterator(header);
-       int index = 0;
-       while (vectorI_has_data(it))
-       {
-               vectorI_get(it, headerString[index]);
-               vectorI_move_next(it);
-               index++;
-       }
-       vectorI_destroy(it);
-       headerString[index] = 0;
-       vector_destroy(header);
-       int position = 1, posID = 0, posPower = 0;
-       char* columnDescriptor = strtok(headerString, ",");
-       while (columnDescriptor != NULL)
-       {
-               if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id"))
-                       posID = position;
-               else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE"))
-                       posPower = position;
-               position++;
-               columnDescriptor = strtok(NULL, ",");
-       }
-       free(headerString);
-
-       // Estimate tsLength with a scan of the 3 first series
-       uint32_t ID=0, lastID=0, refTsLength=0;
-       float rawPower = 0.;
-       scan_line(ifile, posID, &ID, posPower, &rawPower);
-       //'sl' = sample lengths (short because a lot of comparisons then)
-       uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t));
-       for (int i=0; i<3; i++)
-       {
-               lastID = ID;
-               while (ID == lastID)
-               {
-                       sl[i]++;
-                       scan_line(ifile, posID, &ID, posPower, &rawPower);
-               }
-       }
-       if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2])
-               refTsLength = sl[2];
-       else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0])
-               refTsLength = sl[0];
-       else
-               refTsLength = sl[1];
-       free(sl);
-       //go back at the beginning of the first series (ready to read '\n'...)
-       fseek(ifile, headerShift-1, SEEK_SET);
+       //TODO: complete timedate vector from first_time and last_time
+       // --> this gives (expected) tsLength for free
 
+       FILE* ifile = fopen(ifileName, "r");
        // output file to write time-series sequentially, CSV format.
        FILE* ofile = fopen(ofileName, "w");
 
+       // Skip header
+       char nextChar;
+       do
+               nextChar = fgetc(ifile);
+       while (!feof(ifile) && nextChar != '\n' && nextChar != '\r');
+
        // process one client (ID in first column) at a time
        uint64_t processedLines = 0; //execution trace
        uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
@@ -242,3 +194,11 @@ void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
        fclose(ifile);
        fclose(ofile);
 }
+
+int main(int argc, char** argv) // entry point: forwards the 8 CLI arguments to transform()
+{
+       if (argc < 9) return 1; // argv[1]..argv[8] are read below; refuse to run without them
+       transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
+               argv[5], argv[6], argv[7], atoi(argv[8]));
+       return 0;
+}