From: Benjamin Auder Date: Mon, 6 Mar 2017 11:50:02 +0000 (+0100) Subject: prepare converter for DB extracts datasets X-Git-Url: https://git.auder.net/variants/%24%7Bvname%7D/%7B%7B%20asset%28%27mixstore/current/%7B%7B?a=commitdiff_plain;h=86223e279a954d946ae641888f5107ed9feb6217;p=epclust.git prepare converter for DB extracts datasets --- diff --git a/.gitignore b/.gitignore index dbcc2f0..8db5c77 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ /data/* !/data/README !/data/preprocessing/ +/data/prrprocessing/* +!/data/preprocessing/convert.c +!/data/preprocessing/Makefile #files generated by initialize.sh /.gitfat diff --git a/data/preprocessing/Makefile b/data/preprocessing/Makefile new file mode 100644 index 0000000..9a4a044 --- /dev/null +++ b/data/preprocessing/Makefile @@ -0,0 +1,2 @@ +#TODO: depend on cgds... +gcc -o convert_to_CSV convert_to_CSV.c -lm diff --git a/data/preprocessing/convert_32kEDF.R b/data/preprocessing/convert_32kEDF.R deleted file mode 100644 index 2e6798a..0000000 --- a/data/preprocessing/convert_32kEDF.R +++ /dev/null @@ -1,24 +0,0 @@ -convert_32kEDF = function(orig_csv, nb_series_per_chunk) -{ - datetimes = #...TODO: all 3 years? year-by-year is better - orig_con = file(orig_csv, open="r") #2009, 2010 or 2011 - ignored = readLines(orig_con, 1) #skip header - serie_length = length(datetimes) #around 365*24*2 = 17520 - sep = if (year==2009) "," else if (year==2010) ";" else ";" - -scan(orig_con, character(), sep=",", nlines=1, quiet=TRUE) - library(sqldf, quietly=TRUE) - ids = read.csv.sql(file_csv, header = TRUE, sep = "," - sql = "select * from file_csv group by FK_CCU_ID") - index = 0 - repeat - { - if (index+1 >= length(ids)) - break - request = "select CPP_DATE_PUISSANCE,CPP_PUISSANCE_BRUTE where FK_CCU_ID in (" - for (id in ids[index + seq_len(nb_series_per_chunk)]) - request = paste(request, id, ",", sep="") - request = paste(request, ") order by FK_CCU_ID,CPP_DATE_PUISSANCE", sep="") - series_chunk = read.csv.sql(file_csv, header = TRUE, sep = ",", sql = request) - - index = index + 17520 diff --git a/data/preprocessing/serialize.c b/data/preprocessing/convert_to_CSV.c similarity index 66% rename from data/preprocessing/serialize.c rename to data/preprocessing/convert_to_CSV.c index f35da64..a29b7d8 100644 --- a/data/preprocessing/serialize.c +++ b/data/preprocessing/convert_to_CSV.c @@ -46,13 +46,18 @@ char readReal(FILE* stream, float* real) } // Parse a line into integer+float (ID, raw power) -static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower) +static void scan_line(FILE* ifile, + int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower) { char nextChar; int position = 1; while (1) { - if (position == posID) + if (position == posTime) + { + //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...) + } + else if (position == posID) { int64_t ID_on64bits; nextChar = readInt(ifile, &ID_on64bits); @@ -85,82 +90,29 @@ static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* } } -// Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header -void transform(const char* ifileName, const char* ofileName, uint32_t nbItems) + +//TODO: check datetime at each line (build datetimes file ! for each year ?) +//also fill NA with closest value in file (easy) +//01JAN2009:00:00:00 .......... + + +// Main job: parse a data file into a conventional CSV file in rows, without header +void transform(const char* ifileName, int posID, int posTime, int posValue, + char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems { - // Use the header to know positions of ID and rawPower - FILE* ifile = fopen(ifileName, "r"); - uint32_t headerShift = 0; - char curChar; - Vector* header = vector_new(char); - do - { - curChar = fgetc(ifile); - headerShift++; - if (curChar == '\n' || curChar == '\r') - { - // Flush all potential other line feeds - while (curChar == '\n' || curChar == '\r') - curChar = fgetc(ifile); - ungetc(curChar, ifile); - break; - } - vector_push(header, curChar); - } - while (1); - char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char)); - VectorIterator* it = vector_get_iterator(header); - int index = 0; - while (vectorI_has_data(it)) - { - vectorI_get(it, headerString[index]); - vectorI_move_next(it); - index++; - } - vectorI_destroy(it); - headerString[index] = 0; - vector_destroy(header); - int position = 1, posID = 0, posPower = 0; - char* columnDescriptor = strtok(headerString, ","); - while (columnDescriptor != NULL) - { - if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id")) - posID = position; - else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE")) - posPower = position; - position++; - columnDescriptor = strtok(NULL, ","); - } - free(headerString); - - // Estimate tsLength with a scan of the 3 first series - uint32_t ID=0, lastID=0, refTsLength=0; - float rawPower = 0.; - scan_line(ifile, posID, &ID, posPower, &rawPower); - //'sl' = sample lengths (short because a lot of comparisons then) - uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t)); - for (int i=0; i<3; i++) - { - lastID = ID; - while (ID == lastID) - { - sl[i]++; - scan_line(ifile, posID, &ID, posPower, &rawPower); - } - } - if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2]) - refTsLength = sl[2]; - else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0]) - refTsLength = sl[0]; - else - refTsLength = sl[1]; - free(sl); - //go back at the beginning of the first series (ready to read '\n'...) - fseek(ifile, headerShift-1, SEEK_SET); + //TODO: complete timedate vector from first_time and last_time + // --> this gives (expected) tsLength for free + FILE* ifile = fopen(ifileName, "r"); // output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); + // Skip header + char nextChar; + do + nextChar = fgetc(ifile); + while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') + // process one client (ID in first column) at a time uint64_t processedLines = 0; //execution trace uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; @@ -242,3 +194,11 @@ void transform(const char* ifileName, const char* ofileName, uint32_t nbItems) fclose(ifile); fclose(ofile); } + +int main(char** argv, int argc) +{ + //TODO: args checks... + transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]), + argv[5], argv[6], argv[7], atoi(argv[8])); + return 0; +}