From: Benjamin Auder <benjamin.auder@somewhere>
Date: Tue, 14 Mar 2017 14:59:40 +0000 (+0100)
Subject: work on CSV preprocessing
X-Git-Url: https://git.auder.net/js/img/current/pieces/cn.svg?a=commitdiff_plain;h=a0fa5bd0324ecd9bf92e9940e98344f7ee4b2509;p=epclust.git

work on CSV preprocessing
---

diff --git a/.gitignore b/.gitignore
index 255781c..96947f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,6 @@
 /data/*
 !/data/README
 !/data/preprocessing/
-/data/preprocessing/*
-!/data/preprocessing/convert.c
-!/data/preprocessing/Makefile
 
 #files generated by initialize.sh
 /.gitfat
@@ -15,7 +12,7 @@
 *.swp
 
 #ignore binary files generated by claws()
-*.bin
+*.epclust.bin
 
 #ignore R session files
 .Rhistory
@@ -33,9 +30,10 @@
 #ignore jupyter generated file (HTML vignette, and reports)
 *.ipynb.html
 
-#ignore object files
+#ignore object files and executables
 *.o
 *.so
+*.exe
 
 #ignore RcppExports, generated by Rcpp::compileAttributes
 /epclust/R/RcppExports.R
diff --git a/data/preprocessing/Makefile b/data/preprocessing/Makefile
index 9a4a044..097420e 100644
--- a/data/preprocessing/Makefile
+++ b/data/preprocessing/Makefile
@@ -1,2 +1,4 @@
-#TODO: depend on cgds...
-gcc -o convert_to_CSV convert_to_CSV.c -lm
+TARGET = transform.exe
+
+$(TARGET): convert_to_CSV.c
+	gcc convert_to_CSV.c -lm -lcgds -o $(TARGET)
diff --git a/data/preprocessing/README b/data/preprocessing/README
new file mode 100644
index 0000000..1a860df
--- /dev/null
+++ b/data/preprocessing/README
@@ -0,0 +1,4 @@
+Converter from raw by-columns CSV format to a by-rows CSV file
+(much smaller), which is what epclust::claws() expects as CSV input.
+
+Dependency: cgds, https://git.auder.net/?p=cgds.git
diff --git a/data/preprocessing/convert_to_CSV.c b/data/preprocessing/convert_to_CSV.c
index a29b7d8..34cb6e4 100644
--- a/data/preprocessing/convert_to_CSV.c
+++ b/data/preprocessing/convert_to_CSV.c
@@ -5,25 +5,27 @@
 #include <math.h>
 #include <float.h>
 
-char readInt(FILE* stream, int64_t* integer)
+// Read an integer char by char; return the first character after it (already consumed)
+char readInt(FILE* stream, int* integer)
 {
 	*integer = 0;
-	char curChar = fgetc(stream);
-	int sign = (curChar == '-' ? -1 : 1);
-	while (curChar < '0' || curChar > '9')
-		curChar = fgetc(stream);
-	ungetc(curChar, stream);
-	while ((curChar = fgetc(stream)) >= '0' && curChar <= '9')
-		*integer = 10 * (*integer) + (int64_t) (curChar - '0');
+	char nextChar = fgetc(stream);
+	int sign = (nextChar == '-' ? -1 : 1);
+	while (nextChar < '0' || nextChar > '9')
+		nextChar = fgetc(stream);
+	ungetc(nextChar, stream);
+	while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9')
+		*integer = 10 * (*integer) + (int) (nextChar - '0');
 	(*integer) *= sign;
-	return curChar;
+	return nextChar;
 }
 
+// Read a real number char by char; return the first character after it (already consumed)
 char readReal(FILE* stream, float* real)
 {
-	int64_t integerPart;
+	int integerPart;
 	char nextChar = readInt(stream, &integerPart);
-	int64_t fractionalPart = 0;
+	int fractionalPart = 0;
 	int countZeros = 0;
 	if (nextChar == '.')
 	{
@@ -36,7 +38,7 @@ char readReal(FILE* stream, float* real)
 			nextChar = readInt(stream, &fractionalPart);
 		}
 	}
-	int64_t exponent = 0;
+	int exponent = 0;
 	if (nextChar == 'e' || nextChar == 'E')
 		nextChar = readInt(stream, &exponent);
 	*real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart
@@ -45,42 +47,29 @@ char readReal(FILE* stream, float* real)
 	return nextChar;
 }
 
-// Parse a line into integer+float (ID, raw power)
-static void scan_line(FILE* ifile,
-	int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
+// Parse a line into integer+float (ID, value)
+static void scan_line(FILE* ifile, char sep,
+	int posID, int* ID, int posValue, float* value)
 {
 	char nextChar;
 	int position = 1;
 	while (1)
 	{
-		if (position == posTime)
-		{
-			//TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
-		}
-		else if (position == posID)
-		{
-			int64_t ID_on64bits;
-			nextChar = readInt(ifile, &ID_on64bits);
-			*ID = (uint32_t)ID_on64bits;
-		}
-		else if (position == posPower)
-		{
-			float power = FLT_MAX; //"NA"
-			nextChar = readReal(ifile, &power); //?? WARNING here... if empty field ?!
-			*rawPower = (float) power;
-		}
+		if (position == posID)
+			nextChar = readInt(ifile, ID);
+		else if (position == posValue)
+			nextChar = readReal(ifile, value);
 		else
-			//erase the comma (and skip field then)
-			nextChar = fgetc(ifile);
+			nextChar = fgetc(ifile); //erase the comma (and skip field then)
 
-		//continue until next comma (or line end or file end)
-		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
+		// Continue until next separator (or line end or file end)
+		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
 			nextChar = fgetc(ifile);
 		position++;
 
 		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
 		{
-			// skip all potential line feeds
+			// Skip all potential line feeds
 			while (!feof(ifile) && nextChar == '\n' || nextChar == '\r')
 				nextChar = fgetc(ifile);
 			if (!feof(ifile))
@@ -90,70 +79,61 @@ static void scan_line(FILE* ifile,
 	}
 }
 
-
-//TODO: check datetime at each line (build datetimes file ! for each year ?)
-//also fill NA with closest value in file (easy)
-//01JAN2009:00:00:00 ..........
-
-
 // Main job: parse a data file into a conventional CSV file in rows, without header
-void transform(const char* ifileName, int posID, int posTime, int posValue,
-	char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems
+// Current limitations:
+//  - remove partial series (we could fill missing values instead)
+//  - consider missing fields == 0
+//  - IDs should be integers
+int transform(const char* ifileName, int posID, int posValue,
+	const char* ofileName, int nbItems, char sep)
 {
-	//TODO: complete timedate vector from first_time and last_time
-	// --> this gives (expected) tsLength for free
-
 	FILE* ifile = fopen(ifileName, "r");
-	// output file to write time-series sequentially, CSV format.
+	// Output file to write time-series sequentially, CSV format.
 	FILE* ofile = fopen(ofileName, "w");
 
 	// Skip header
-	char nextChar;
+	char curChar;
 	do
-		nextChar = fgetc(ifile);
-	while (!feof(ifile) && nextChar != '\n' && nextChar != '\r')
+		curChar = fgetc(ifile);
+	while (!feof(ifile) && curChar != '\n' && curChar != '\r');
 
-	// process one client (ID in first column) at a time
+	// Process one client (ID in first column) at a time
 	uint64_t processedLines = 0; //execution trace
-	uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0;
-	uint32_t mismatchLengthCount=0;
-	float tsBuffer[refTsLength];
-	lastID = 0;
+	uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
+	int tsLength=0, refTsLength=0, lastID=0, ID=0;
+	float value=0., tsBuffer[refTsLength];
 	while (!feof(ifile))
 	{
-		// next element to read always start with a digit
-		do
+		// Go to next line
+		while (!feof(ifile) && (curChar == '\n' || curChar == '\r'))
 			curChar = fgetc(ifile);
-		while (!feof(ifile) && (curChar < '0' || curChar > '9'));
 		if (feof(ifile))
 			break;
 		ungetc(curChar, ifile);
 
-		// read line
-		scan_line(ifile, posID, &ID, posPower, &rawPower);
+		// Read current line
+		scan_line(ifile, sep, posID, &ID, posValue, &value);
 		if (ID != lastID)
 		{
-			//just starting a new time-series: must process the last one (if there is a last one !)
+			// Just starting a new time-series: must process the last one (if exists !)
 			if (lastID > 0)
 			{
+				if (refTsLength == 0)
+					refTsLength = tsLength; //first serie is considered clean
 				if (tsLength == refTsLength)
 				{
 					for (int i=0; i<tsLength; i++)
 					{
-						char* format = i<tsLength-1 ? "%g," : "%g";
-						fprintf(ofile, format, tsBuffer[i]);
+						char* format = i<tsLength-1 ? "%g%c" : "%g";
+						fprintf(ofile, format, tsBuffer[i], sep);
 					}
 					fprintf(ofile, "\n");
 					if (nbItems > 0 && ++seriesCount >= nbItems)
 						break;
 				}
-				//if something wrong happened, skip series
+				// Mismatch lengths: skip series
 				else
-				{
-					skippedSeriesCount++;
-					if (tsLength != refTsLength)
-						mismatchLengthCount++;
-				}
+					mismatchLengthCount++;
 			}
 
 			// reinitialize flags
@@ -163,42 +143,57 @@ void transform(const char* ifileName, int posID, int posTime, int posValue,
 
 		//We cannot write more than refTsLength values
 		if (tsLength < refTsLength)
-			tsBuffer[tsLength++] = rawPower;
+			tsBuffer[tsLength++] = value;
 
 		if ((++processedLines) % 1000000 == 0)
 			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
 	}
 
-	if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems))
+	if (nbItems <= 0 || seriesCount < nbItems)
 	{
 		// flush last time-series if all conditions are met
-		for (int i=0; i<tsLength; i++)
+		if (tsLength == refTsLength)
 		{
-			char* format = i<tsLength-1 ? "%g," : "%g";
-			fprintf(ofile, format, tsBuffer[i]);
+			for (int i=0; i<tsLength; i++)
+			{
+				char* format = i<tsLength-1 ? "%g%c" : "%g";
+				fprintf(ofile, format, tsBuffer[i], sep);
+			}
+			fprintf(ofile, "\n");
+			seriesCount++;
 		}
-		fprintf(ofile, "\n");
-		seriesCount++;
-	}
-	else if (nbItems <= 0 || seriesCount < nbItems)
-	{
-		if (tsLength != refTsLength)
+		else
 			mismatchLengthCount++;
 	}
 
 	// finally print some statistics
-	if (seriesCount < nbItems)
-		fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount);
-	fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount);
+	fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
+	if (mismatchLengthCount > 0)
+		fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
 
 	fclose(ifile);
 	fclose(ofile);
+	return 0;
 }
 
-int main(char** argv, int argc)
+int main(int argc, char** argv)
 {
-	//TODO: args checks...
-	transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
-		argv[5], argv[6], argv[7], atoi(argv[8]));
-	return 0;
+	if (argc < 4) //program name + 3 arguments
+	{
+		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
+  - ifileName: name of by-columns CSV input file\n \
+  - posID: position of the identifier in a line (start at 1)\n \
+  - posValue: position of the value of interest in a line\n \
+  - ofileName: name of the output file; default: out.csv\n \
+  - nbItems: number of series to retrieve; default: 0 (all)\n \
+  - sep: fields separator; default: ','\n");
+		return 0;
+	}
+	else
+	{
+		return transform(argv[1], atoi(argv[2]), atoi(argv[3]),
+			argc > 4 ? argv[4] : "out.csv",
+			argc > 5 ? atoi(argv[5]) : 0,
+			argc > 6 ? argv[6][0] : ',');
+	}
 }
diff --git a/data/preprocessing/input_test.csv b/data/preprocessing/input_test.csv
new file mode 100644
index 0000000..e69de29
diff --git a/data/preprocessing/test_convert.c b/data/preprocessing/test_convert.c
new file mode 100644
index 0000000..20a5807
--- /dev/null
+++ b/data/preprocessing/test_convert.c
@@ -0,0 +1,5 @@
+int main(int argc, char** argv)
+{
+	execl("transform", "input_test.csv", 1, 3, "output_test.csv", ",");
+	//TODO: diff, .... http://stackoverflow.com/questions/7292642/grabbing-output-from-exec
+}