fix convert_to_CSV.c
author Benjamin Auder <benjamin.auder@somewhere>
Sat, 18 Mar 2017 07:13:55 +0000 (08:13 +0100)
committer Benjamin Auder <benjamin.auder@somewhere>
Sat, 18 Mar 2017 07:13:55 +0000 (08:13 +0100)
data/preprocessing/convert_to_CSV.c
data/preprocessing/expected_output_test.csv

index df31f71..646c440 100644 (file)
 char readInt(FILE* stream, int* integer)
 {
        *integer = 0;
-       char nextChar = fgetc(stream);
-       int sign = (nextChar == '-' ? -1 : 1);
-       while (nextChar < '0' || nextChar > '9')
-               nextChar = fgetc(stream);
-       ungetc(nextChar, stream);
-       while ((nextChar = fgetc(stream)) >= '0' && nextChar <= '9')
+       char curChar = fgetc(stream);
+       int sign = (curChar == '-' ? -1 : 1);
+       while (curChar < '0' || curChar > '9')
+               curChar = fgetc(stream);
+       while (curChar >= '0' && curChar <= '9')
        {
-
-       printf("next char: %c\n",nextChar);
-
-               *integer = 10 * (*integer) + (int) (nextChar - '0'); }
+               *integer = 10 * (*integer) + (int) (curChar - '0');
+               curChar = fgetc(stream);
+       }
        (*integer) *= sign;
-printf("INTEGER: %i\n",*integer);
-       return nextChar;
+       return curChar; //separator, endline or .,e,E (if inside readReal)
 }
 
 // Read a real number char by char, and position the cursor to next character
 char readReal(FILE* stream, float* real)
 {
-       int integerPart;
-       char nextChar = readInt(stream, &integerPart);
-       int fractionalPart = 0;
-       int countZeros = 0;
-       if (nextChar == '.')
+       int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0;
+       char curChar = readInt(stream, &integerPart);
+       if (curChar == '.')
        {
                //need to count zeros
-               while ((nextChar = fgetc(stream)) == '0')
+               while ((curChar = fgetc(stream)) == '0')
                        countZeros++;
-               if (nextChar >= '1' && nextChar <= '9')
+               if (curChar >= '1' && curChar <= '9')
                {
-                       ungetc(nextChar, stream);
-                       nextChar = readInt(stream, &fractionalPart);
+                       ungetc(curChar, stream);
+                       curChar = readInt(stream, &fractionalPart);
                }
        }
-       int exponent = 0;
-       if (nextChar == 'e' || nextChar == 'E')
-               nextChar = readInt(stream, &exponent);
+       if (curChar == 'e' || curChar == 'E')
+               curChar = readInt(stream, &exponent);
        *real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart
                / pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) )
                        * pow(10,exponent);
-       return nextChar;
+       return curChar; //separator or endline
 }
 
 // Parse a line into integer+float (ID, value)
 static void scan_line(FILE* ifile, char sep,
        int posID, int* ID, int posValue, float* value)
 {
-       char nextChar;
+       char curChar;
        int position = 1;
        while (1)
        {
                if (position == posID)
-                       nextChar = readInt(ifile, ID);
+                       curChar = readInt(ifile, ID);
                else if (position == posValue)
-                       nextChar = readReal(ifile, value);
+                       curChar = readReal(ifile, value);
                else
-                       nextChar = fgetc(ifile); //erase the comma (and skip field then)
+                       curChar = fgetc(ifile); //erase the comma (and skip field then)
 
                // Continue until next separator (or line end or file end)
-               while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
-                       nextChar = fgetc(ifile);
+               while (!feof(ifile) && curChar != '\n' && curChar != sep)
+                       curChar = fgetc(ifile);
                position++;
 
-               if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
+               if (curChar == '\n' || feof(ifile))
                {
-                       // Skip all potential line feeds
-                       while (!feof(ifile) && nextChar == '\n' || nextChar == '\r')
-                               nextChar = fgetc(ifile);
-                       if (!feof(ifile))
-                               ungetc(nextChar, ifile);
+                       // Reached end of line
                        break;
                }
        }
@@ -88,11 +78,17 @@ static void scan_line(FILE* ifile, char sep,
 // Main job: parse a data file into a conventional CSV file in rows, without header
 // Current limitations:
 //  - remove partial series (we could fill missing values instead)
-//  - consider missing fields == 0
-//  - IDs should be integers
+//  - consider missing fields == 0 (if ,,)
+//  - IDs should be st. pos. integers
+//  - UNIX linebreaks only (\n)
 int transform(const char* ifileName, int posID, int posValue,
        const char* ofileName, int nbItems, char sep)
 {
+       uint64_t processedLines = 0; //execution trace
+       uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
+       int tsLength, lastID=0, ID, firstID, eof;
+       float value, tmpVal;
+       Vector* tsBuffer = vector_new(float);
        FILE* ifile = fopen(ifileName, "r");
        // Output file to write time-series sequentially, CSV format.
        FILE* ofile = fopen(ofileName, "w");
@@ -101,77 +97,78 @@ int transform(const char* ifileName, int posID, int posValue,
        char curChar;
        do
                curChar = fgetc(ifile);
-       while (!feof(ifile) && curChar != '\n' && curChar != '\r');
+       while (curChar != '\n');
 
        // Process one client (ID in first column) at a time
-       uint64_t processedLines = 0; //execution trace
-       uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
-       int tsLength=0, refTsLength=0, lastID=0, ID=0;
-       float value=0., tsBuffer[refTsLength];
-       while (!feof(ifile))
+       while (1)
        {
-               // Go to next line
-               while (!feof(ifile) && (curChar == '\n' || curChar == '\r'))
-                       curChar = fgetc(ifile);
-               if (feof(ifile))
-                       break;
-               ungetc(curChar, ifile);
 
-               // Read current line
-               scan_line(ifile, sep, posID, &ID, posValue, &value);
+               eof = feof(ifile);
+               if (!eof)
+               {
+                       // Is there anything left to read? (file may end with '\n')
+                       curChar = fgetc(ifile);
+                       if (!feof(ifile) && curChar != '\n')
+                       {
+                               // Yes: read current line
+                               ungetc(curChar, ifile);
+                               scan_line(ifile, sep, posID, &ID, posValue, &value);
+                       }
+                       else
+                               eof = 1;
+               }
 
-               printf("SCAN: %i %g\n",ID,value);
-               if (ID != lastID)
+               if (ID != lastID || eof)
                {
-                       // Just starting a new time-series: must process the last one (if exists !)
                        if (lastID > 0)
                        {
-                               if (tsLength == refTsLength)
+                               // Just starting a new time-series (or EOF): process the last one
+                               if (tsLength == vector_size(tsBuffer))
                                {
-                                       for (int i=0; i<tsLength; i++)
+                                       for (int i=0; i<tsLength-1; i++)
                                        {
-                                               char* format = i<tsLength-1 ? "%g%c" : "%g";
-                                               fprintf(ofile, format, tsBuffer[i], sep);
+                                               vector_get(tsBuffer, i, tmpVal);
+                                               fprintf(ofile, "%g%c", tmpVal, sep);
                                        }
-                                       fprintf(ofile, "\n");
+                                       vector_get(tsBuffer, tsLength-1, tmpVal);
+                                       fprintf(ofile, "%g\n", tmpVal);
+                                       seriesCount++;
                                        if (nbItems > 0 && ++seriesCount >= nbItems)
                                                break;
                                }
-                               // Mismatch lengths: skip series
                                else
+                               {
+                                       // Mismatch lengths: skip series
                                        mismatchLengthCount++;
+                               }
                        }
                        else
-                               refTsLength = tsLength; //first serie is considered clean
-
-                       // reinitialize flags
+                               firstID = ID;
+                       if (eof)
+                       {
+                               // Last serie is processed
+                               break;
+                       }
+                       // Reinitialize current index of new serie
                        tsLength = 0;
                        lastID = ID;
                }
-printf("LA %i %i\n",tsLength,refTsLength);
-               //We cannot write more than refTsLength values
-               if (tsLength < refTsLength)
-                       tsBuffer[tsLength++] = value;
-
-               if ((++processedLines) % 1000000 == 0)
-                       fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
-       }
 
-       if (nbItems <= 0 || seriesCount < nbItems)
-       {
-               // flush last time-series if all conditions are met
-               if (tsLength == refTsLength)
+               // Fill values buffer
+               if (ID != firstID)
                {
-                       for (int i=0; i<tsLength; i++)
-                       {
-                               char* format = i<tsLength-1 ? "%g%c" : "%g";
-                               fprintf(ofile, format, tsBuffer[i], sep);
-                       }
-                       fprintf(ofile, "\n");
-                       seriesCount++;
+                       if (tsLength < vector_size(tsBuffer))
+                               vector_set(tsBuffer, tsLength, value);
                }
                else
-                       mismatchLengthCount++;
+               {
+                       // First serie is reference: push all values
+                       vector_push(tsBuffer, value);
+               }
+               tsLength++;
+
+               if ((++processedLines) % 1000000 == 0)
+                       fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
        }
 
        // finally print some statistics
index 4cd40d7..f837a25 100644 (file)
@@ -1,3 +1,3 @@
-1.05,2.,3.
-1e4,1.,0.
-3.25e2,-2.0e3,15.55
+1.05,2,3
+10000,1,0
+325,-2000,15.55