From a2fd2d76599672ac6396a0da1ae72007705044cb Mon Sep 17 00:00:00 2001
From: Benjamin Auder <benjamin.auder@somewhere>
Date: Sat, 18 Mar 2017 08:13:55 +0100
Subject: [PATCH] fix convert_to_CSV.c

---
 data/preprocessing/convert_to_CSV.c         | 167 ++++++++++----------
 data/preprocessing/expected_output_test.csv |   6 +-
 2 files changed, 85 insertions(+), 88 deletions(-)

diff --git a/data/preprocessing/convert_to_CSV.c b/data/preprocessing/convert_to_CSV.c
index df31f71..646c440 100644
--- a/data/preprocessing/convert_to_CSV.c
+++ b/data/preprocessing/convert_to_CSV.c
@@ -10,76 +10,66 @@
 char readInt(FILE* stream, int* integer)
 {
 	*integer = 0;
-	char nextChar = fgetc(stream);
-	int sign = (nextChar == '-' ? -1 : 1);
-	while (nextChar < '0' || nextChar > '9')
-		nextChar = fgetc(stream);
-	ungetc(nextChar, stream);
-	while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9')
+	int curChar = fgetc(stream); //int, not char: EOF must stay distinct from every byte value
+	int sign = (curChar == '-' ? -1 : 1); //only a '-' in first position is recognized
+	while (curChar != EOF && (curChar < '0' || curChar > '9'))
+		curChar = fgetc(stream); //skip non-digits, but give up at end of file
+	while (curChar >= '0' && curChar <= '9')
 	{
-
-	printf("next char: %c\n",nextChar);
-
-		*integer = 10 * (*integer) + (int) (nextChar - '0'); }
+		*integer = 10 * (*integer) + (int) (curChar - '0');
+		curChar = fgetc(stream);
+	}
 	(*integer) *= sign;
-printf("INTEGER: %i\n",*integer);
-	return nextChar;
+	return curChar; //separator, endline, .,e,E (if inside readReal), or EOF converted to char
 }
 
 // Read a real number char by char, and position the cursor to next character
 char readReal(FILE* stream, float* real)
 {
-	int integerPart;
-	char nextChar = readInt(stream, &integerPart);
-	int fractionalPart = 0;
-	int countZeros = 0;
-	if (nextChar == '.')
+	int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0;
+	int firstChar = ungetc(fgetc(stream), stream); //peek: readInt() cannot report the sign when the integer part is 0
+	char curChar = readInt(stream, &integerPart);
+	if (curChar == '.')
 	{
 		//need to count zeros
-		while ((nextChar = fgetc(stream)) == '0')
+		while ((curChar = fgetc(stream)) == '0')
 			countZeros++;
-		if (nextChar >= '1' && nextChar <= '9')
+		if (curChar >= '1' && curChar <= '9')
 		{
-			ungetc(nextChar, stream);
-			nextChar = readInt(stream, &fractionalPart);
+			ungetc(curChar, stream);
+			curChar = readInt(stream, &fractionalPart);
 		}
 	}
-	int exponent = 0;
-	if (nextChar == 'e' || nextChar == 'E')
-		nextChar = readInt(stream, &exponent);
-	*real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart
-		/ pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) )
-			* pow(10,exponent);
-	return nextChar;
+	if (curChar == 'e' || curChar == 'E')
+		curChar = readInt(stream, &exponent);
+	*real = ( integerPart + (firstChar == '-' ? -1. : 1.) * (float)fractionalPart //sign from the text: "0.5" is positive, "-0.5" negative
+		/ pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) ) * pow(10,exponent);
+	return curChar; //separator or endline
 }
 
 // Parse a line into integer+float (ID, value)
 static void scan_line(FILE* ifile, char sep,
 	int posID, int* ID, int posValue, float* value)
 {
-	char nextChar;
+	char curChar;
 	int position = 1;
 	while (1)
 	{
 		if (position == posID)
-			nextChar = readInt(ifile, ID);
+			curChar = readInt(ifile, ID);
 		else if (position == posValue)
-			nextChar = readReal(ifile, value);
+			curChar = readReal(ifile, value);
 		else
-			nextChar = fgetc(ifile); //erase the comma (and skip field then)
+			curChar = fgetc(ifile); //erase the comma (and skip field then)
 
 		// Continue until next separator (or line end or file end)
-		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
-			nextChar = fgetc(ifile);
+		while (!feof(ifile) && curChar != '\n' && curChar != sep)
+			curChar = fgetc(ifile);
 		position++;
 
-		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
+		if (curChar == '\n' || feof(ifile))
 		{
-			// Skip all potential line feeds
-			while (!feof(ifile) && nextChar == '\n' || nextChar == '\r')
-				nextChar = fgetc(ifile);
-			if (!feof(ifile))
-				ungetc(nextChar, ifile);
+			// Reached end of line
 			break;
 		}
 	}
@@ -88,11 +78,17 @@ static void scan_line(FILE* ifile, char sep,
 // Main job: parse a data file into a conventional CSV file in rows, without header
 // Current limitations:
 //  - remove partial series (we could fill missing values instead)
-//  - consider missing fields == 0
-//  - IDs should be integers
+//  - consider missing fields == 0 (if ,,)
+//  - IDs should be strictly positive integers
+//  - UNIX linebreaks only (\n)
 int transform(const char* ifileName, int posID, int posValue,
 	const char* ofileName, int nbItems, char sep)
 {
+	uint64_t processedLines = 0; //execution trace
+	uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
+	int tsLength=0, lastID=0, ID=0, firstID=0, eof; //ID is compared before any line is scanned when the file has no data
+	float value=0., tmpVal; //value is initialized here and passed by address to scan_line
+	Vector* tsBuffer = vector_new(float);
 	FILE* ifile = fopen(ifileName, "r");
 	// Output file to write time-series sequentially, CSV format.
 	FILE* ofile = fopen(ofileName, "w");
@@ -101,77 +97,78 @@ int transform(const char* ifileName, int posID, int posValue,
 	char curChar;
 	do
 		curChar = fgetc(ifile);
-	while (!feof(ifile) && curChar != '\n' && curChar != '\r');
+	while (!feof(ifile) && curChar != '\n'); //also stop at EOF: empty or header-only input without '\n'
 
 	// Process one client (ID in first column) at a time
-	uint64_t processedLines = 0; //execution trace
-	uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
-	int tsLength=0, refTsLength=0, lastID=0, ID=0;
-	float value=0., tsBuffer[refTsLength];
-	while (!feof(ifile))
+	while (1)
 	{
-		// Go to next line
-		while (!feof(ifile) && (curChar == '\n' || curChar == '\r'))
-			curChar = fgetc(ifile);
-		if (feof(ifile))
-			break;
-		ungetc(curChar, ifile);
 
-		// Read current line
-		scan_line(ifile, sep, posID, &ID, posValue, &value);
+		eof = feof(ifile);
+		if (!eof)
+		{
+			// Is there anything left to read? (file may end with '\n')
+			curChar = fgetc(ifile);
+			if (!feof(ifile) && curChar != '\n')
+			{
+				// Yes: read current line
+				ungetc(curChar, ifile);
+				scan_line(ifile, sep, posID, &ID, posValue, &value);
+			}
+			else
+				eof = 1;
+		}
 
-		printf("SCAN: %i %g\n",ID,value);
-		if (ID != lastID)
+		if (ID != lastID || eof)
 		{
-			// Just starting a new time-series: must process the last one (if exists !)
 			if (lastID > 0)
 			{
-				if (tsLength == refTsLength)
+				// Just starting a new time-series (or EOF): process the last one
+				if (tsLength == vector_size(tsBuffer))
 				{
-					for (int i=0; i<tsLength; i++)
+					for (int i=0; i<tsLength-1; i++)
 					{
-						char* format = i<tsLength-1 ? "%g%c" : "%g";
-						fprintf(ofile, format, tsBuffer[i], sep);
+						vector_get(tsBuffer, i, tmpVal);
+						fprintf(ofile, "%g%c", tmpVal, sep);
 					}
-					fprintf(ofile, "\n");
+					vector_get(tsBuffer, tsLength-1, tmpVal);
+					fprintf(ofile, "%g\n", tmpVal);
-					if (nbItems > 0 && ++seriesCount >= nbItems)
+					seriesCount++; //count every written series exactly once
+					if (nbItems > 0 && seriesCount >= nbItems) //no second increment here
 						break;
 				}
-				// Mismatch lengths: skip series
 				else
+				{
+					// Mismatch lengths: skip series
 					mismatchLengthCount++;
+				}
 			}
 			else
-				refTsLength = tsLength; //first serie is considered clean
-
-			// reinitialize flags
+				firstID = ID;
+			if (eof)
+			{
+				// Last series has been processed
+				break;
+			}
+			// Reinitialize current index for the new series
 			tsLength = 0;
 			lastID = ID;
 		}
-printf("LA %i %i\n",tsLength,refTsLength);
-		//We cannot write more than refTsLength values
-		if (tsLength < refTsLength)
-			tsBuffer[tsLength++] = value;
-
-		if ((++processedLines) % 1000000 == 0)
-			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
-	}
 
-	if (nbItems <= 0 || seriesCount < nbItems)
-	{
-		// flush last time-series if all conditions are met
-		if (tsLength == refTsLength)
+		// Fill values buffer
+		if (ID != firstID)
 		{
-			for (int i=0; i<tsLength; i++)
-			{
-				char* format = i<tsLength-1 ? "%g%c" : "%g";
-				fprintf(ofile, format, tsBuffer[i], sep);
-			}
-			fprintf(ofile, "\n");
-			seriesCount++;
+			if (tsLength < vector_size(tsBuffer))
+				vector_set(tsBuffer, tsLength, value);
 		}
 		else
-			mismatchLengthCount++;
+		{
+			// First series is the reference: push all values
+			vector_push(tsBuffer, value);
+		}
+		tsLength++;
+
+		if ((++processedLines) % 1000000 == 0)
+			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
 	}
 
 	// finally print some statistics
diff --git a/data/preprocessing/expected_output_test.csv b/data/preprocessing/expected_output_test.csv
index 4cd40d7..f837a25 100644
--- a/data/preprocessing/expected_output_test.csv
+++ b/data/preprocessing/expected_output_test.csv
@@ -1,3 +1,3 @@
-1.05,2.,3.
-1e4,1.,0.
-3.25e2,-2.0e3,15.55
+1.05,2,3
+10000,1,0
+325,-2000,15.55
-- 
2.44.0