From: Benjamin Auder <benjamin.auder@somewhere>
Date: Mon, 6 Mar 2017 11:50:02 +0000 (+0100)
Subject: prepare converter for DB extracts datasets
X-Git-Url: https://git.auder.net/variants/img/pieces/doc/doc/html/up.jpg?a=commitdiff_plain;h=86223e279a954d946ae641888f5107ed9feb6217;p=epclust.git

prepare converter for DB extracts datasets
---

diff --git a/.gitignore b/.gitignore
index dbcc2f0..8db5c77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
 /data/*
 !/data/README
 !/data/preprocessing/
+/data/preprocessing/*
+!/data/preprocessing/convert_to_CSV.c
+!/data/preprocessing/Makefile
 
 #files generated by initialize.sh
 /.gitfat
diff --git a/data/preprocessing/Makefile b/data/preprocessing/Makefile
new file mode 100644
index 0000000..9a4a044
--- /dev/null
+++ b/data/preprocessing/Makefile
@@ -0,0 +1,2 @@
+#TODO: depend on cgds...
+convert_to_CSV: convert_to_CSV.c ; gcc -o convert_to_CSV convert_to_CSV.c -lm
diff --git a/data/preprocessing/convert_32kEDF.R b/data/preprocessing/convert_32kEDF.R
deleted file mode 100644
index 2e6798a..0000000
--- a/data/preprocessing/convert_32kEDF.R
+++ /dev/null
@@ -1,24 +0,0 @@
-convert_32kEDF = function(orig_csv, nb_series_per_chunk)
-{
-	datetimes = #...TODO: all 3 years? year-by-year is better
-	orig_con = file(orig_csv, open="r") #2009, 2010 or 2011
-	ignored = readLines(orig_con, 1) #skip header
-	serie_length = length(datetimes) #around 365*24*2 = 17520
-	sep = if (year==2009) "," else if (year==2010) ";" else ";"
-
-scan(orig_con, character(), sep=",", nlines=1, quiet=TRUE)
-	library(sqldf, quietly=TRUE)
-	ids = read.csv.sql(file_csv, header = TRUE, sep = ","
-		sql = "select * from file_csv group by FK_CCU_ID")
-	index = 0
-	repeat
-	{
-		if (index+1 >= length(ids))
-			break
-		request = "select CPP_DATE_PUISSANCE,CPP_PUISSANCE_BRUTE where FK_CCU_ID in ("
-		for (id in ids[index + seq_len(nb_series_per_chunk)])
-			request = paste(request, id, ",", sep="")
-		request = paste(request, ") order by FK_CCU_ID,CPP_DATE_PUISSANCE", sep="")
-		series_chunk = read.csv.sql(file_csv, header = TRUE, sep = ",", sql = request)
-		
-		index = index + 17520
diff --git a/data/preprocessing/serialize.c b/data/preprocessing/convert_to_CSV.c
similarity index 66%
rename from data/preprocessing/serialize.c
rename to data/preprocessing/convert_to_CSV.c
index f35da64..a29b7d8 100644
--- a/data/preprocessing/serialize.c
+++ b/data/preprocessing/convert_to_CSV.c
@@ -46,13 +46,18 @@ char readReal(FILE* stream, float* real)
 }
 
 // Parse a line into integer+float (ID, raw power)
-static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float* rawPower)
+static void scan_line(FILE* ifile,
+	int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
 {
 	char nextChar;
 	int position = 1;
 	while (1)
 	{
-		if (position == posID)
+		if (position == posTime)
+		{
+			//TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
+		}
+		else if (position == posID)
 		{
 			int64_t ID_on64bits;
 			nextChar = readInt(ifile, &ID_on64bits);
@@ -85,82 +90,29 @@ static void scan_line(FILE* ifile, int posID, uint32_t* ID, int posPower, float*
 	}
 }
 
-// Main job: parse an "EDF-text file" into a conventional CSV file in rows, without header
-void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
+
+//TODO: check the datetime on each line (build a reference datetimes file, one per year?)
+//Also fill NA entries with the closest value in the file (easy)
+//Datetimes look like 01JAN2009:00:00:00 ...
+
+
+// Main job: parse a data file into a conventional CSV file in rows, without header
+void transform(const char* ifileName, int posID, int posTime, int posValue,
+	char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems
 {
-	// Use the header to know positions of ID and rawPower
-	FILE* ifile = fopen(ifileName, "r");
-	uint32_t headerShift = 0;
-	char curChar;
-	Vector* header = vector_new(char);
-	do
-	{
-		curChar = fgetc(ifile);
-		headerShift++;
-		if (curChar == '\n' || curChar == '\r')
-		{
-			// Flush all potential other line feeds
-			while (curChar == '\n' || curChar == '\r')
-				curChar = fgetc(ifile);
-			ungetc(curChar, ifile);
-			break;
-		}
-		vector_push(header, curChar);
-	}
-	while (1);
-	char* headerString = (char*)malloc((vector_size(header) + 1)*sizeof(char));
-	VectorIterator* it = vector_get_iterator(header);
-	int index = 0;
-	while (vectorI_has_data(it))
-	{
-		vectorI_get(it, headerString[index]);
-		vectorI_move_next(it);
-		index++;
-	}
-	vectorI_destroy(it);
-	headerString[index] = 0;
-	vector_destroy(header);
-	int position = 1, posID = 0, posPower = 0;
-	char* columnDescriptor = strtok(headerString, ",");
-	while (columnDescriptor != NULL)
-	{
-		if (!strcmp(columnDescriptor,"FK_CCU_ID") || !strcmp(columnDescriptor,"fk_ccu_id"))
-			posID = position;
-		else if (!strcmp(columnDescriptor,"CPP_PUISSANCE_BRUTE"))
-			posPower = position;
-		position++;
-		columnDescriptor = strtok(NULL, ",");
-	}
-	free(headerString);
-
-	// Estimate tsLength with a scan of the 3 first series
-	uint32_t ID=0, lastID=0, refTsLength=0;
-	float rawPower = 0.;
-	scan_line(ifile, posID, &ID, posPower, &rawPower);
-	//'sl' = sample lengths (short because a lot of comparisons then)
-	uint32_t* sl = (uint32_t*) calloc(3, sizeof(uint32_t));
-	for (int i=0; i<3; i++)
-	{
-		lastID = ID;
-		while (ID == lastID)
-		{
-			sl[i]++;
-			scan_line(ifile, posID, &ID, posPower, &rawPower);
-		}
-	}
-	if (sl[0] <= sl[1] <= sl[2] || sl[1] <= sl[0] <= sl[2])
-		refTsLength = sl[2];
-	else if (sl[1] <= sl[2] <= sl[0] || sl[2] <= sl[1] <= sl[0])
-		refTsLength = sl[0];
-	else
-		refTsLength = sl[1];
-	free(sl);
-	//go back at the beginning of the first series (ready to read '\n'...)
-	fseek(ifile, headerShift-1, SEEK_SET);
+	//TODO: build the complete datetime vector from firstTime and lastTime
+	// --> this gives the (expected) tsLength for free
 
+	FILE* ifile = fopen(ifileName, "r");
 	// output file to write time-series sequentially, CSV format.
 	FILE* ofile = fopen(ofileName, "w");
 
+	// Skip header: consume characters up to and including the first '\n' or '\r' (or until EOF)
+	char nextChar;
+	do
+		nextChar = fgetc(ifile);
+	while (!feof(ifile) && nextChar != '\n' && nextChar != '\r');
+
 	// process one client (ID in first column) at a time
 	uint64_t processedLines = 0; //execution trace
 	uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
@@ -242,3 +194,11 @@ void transform(const char* ifileName, const char* ofileName, uint32_t nbItems)
 	fclose(ifile);
 	fclose(ofile);
 }
+
+int main(int argc, char** argv) //TODO: validate integer args (atoi() cannot report errors)
+{
+	if (argc < 9) { fputs("usage: convert_to_CSV ifile posID posTime posValue firstTime lastTime ofile nbItems\n", stderr); return 1; }
+	transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
+		argv[5], argv[6], argv[7], atoi(argv[8]));
+	return 0;
+}