X-Git-Url: https://git.auder.net/?p=epclust.git;a=blobdiff_plain;f=data%2Fpreprocessing%2Fconvert_to_CSV.c;fp=data%2Fpreprocessing%2Fserialize.c;h=a29b7d87e343e39164cbeedb9f19db0f52319d96;hp=f35da6490c204e0a57dc0de4a0f93cf39b87bd45;hb=86223e279a954d946ae641888f5107ed9feb6217;hpb=c133b1bd162091c6fc2baeea0c9f1d0c1f1369fb diff --git a/data/preprocessing/serialize.c b/data/preprocessing/convert_to_CSV.c similarity index 66% rename from data/preprocessing/serialize.c rename to data/preprocessing/convert_to_CSV.c index f35da64..a29b7d8 100644 --- a/data/preprocessing/serialize.c +++ b/data/preprocessing/convert_to_CSV.c @@ -46,13 +46,18 @@ char readReal(FILE* stream, float* real) } // Parse a line into integer+float (ID, raw power) -static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower) +static void scan_line(FILE* ifile, + int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower) { char nextChar; int position = 1; while (1) { - if (position == posID) + if (position == posTime) + { + //TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...) + } + else if (position == posID) { int64_t ID_on64bits; nextChar = readInt(ifile, &ID_on64bits); @@ -85,82 +90,29 @@ static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* } } -// Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header -void transform(const char* ifileName, const char* ofileName, uint32_t nbItems) + +//TODO: check datetime at each line (build datetimes file ! for each year ?) +//also fill NA with closest value in file (easy) +//01JAN2009:00:00:00 .......... + + +// Main job: parse a data file into a conventional CSV file in rows, without header +void transform(const char* ifileName, int posID, int posTime, int posValue, + char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems { - // Use the header to know positions of ID and rawPower - FILE* ifile = fopen(ifileName, "r"); - uint32_t headerShift = 0; - char curChar; - Vector* header = vector_new(char); - do - { - curChar = fgetc(ifile); - headerShift++; - if (curChar == '\n' || curChar == '\r') - { - // Flush all potential other line feeds - while (curChar == '\n' || curChar == '\r') - curChar = fgetc(ifile); - ungetc(curChar, ifile); - break; - } - vector_push(header, curChar); - } - while (1); - char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char)); - VectorIterator* it = vector_get_iterator(header); - int index = 0; - while (vectorI_has_data(it)) - { - vectorI_get(it, headerString[index]); - vectorI_move_next(it); - index++; - } - vectorI_destroy(it); - headerString[index] = 0; - vector_destroy(header); - int position = 1, posID = 0, posPower = 0; - char* columnDescriptor = strtok(headerString, ","); - while (columnDescriptor != NULL) - { - if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id")) - posID = position; - else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE")) - posPower = position; - position++; - columnDescriptor = strtok(NULL, ","); - } - free(headerString); - - // Estimate tsLength with a scan of the 3 first series - uint32_t ID=0, lastID=0, refTsLength=0; - float rawPower = 0.; - scan_line(ifile, posID, &ID, posPower, &rawPower); - //'sl' = sample lengths (short because a lot of comparisons then) - uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t)); - for (int i=0; i<3; i++) - { - lastID = ID; - while (ID == lastID) - { - sl[i]++; - scan_line(ifile, posID, &ID, posPower, &rawPower); - } - } - if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2]) - refTsLength = sl[2]; - else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0]) - refTsLength = sl[0]; - else - refTsLength = sl[1]; - free(sl); - //go back at the beginning of the first series (ready to read '\n'...) - fseek(ifile, headerShift-1, SEEK_SET); + //TODO: complete timedate vector from first_time and last_time + // --> this gives (expected) tsLength for free + FILE* ifile = fopen(ifileName, "r"); // output file to write time-series sequentially, CSV format. FILE* ofile = fopen(ofileName, "w"); + // Skip header + char nextChar; + do + nextChar = fgetc(ifile); + while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') + // process one client (ID in first column) at a time uint64_t processedLines = 0; //execution trace uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; @@ -242,3 +194,11 @@ void transform(const char* ifileName, const char* ofileName, uint32_t nbItems) fclose(ifile); fclose(ofile); } + +int main(char** argv, int argc) +{ + //TODO: args checks... + transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]), + argv[5], argv[6], argv[7], atoi(argv[8])); + return 0; +}