data/preprocessing/convert_to_CSV.c

#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <errno.h>
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cgds/Vector.h>
   8
   9 // Read an integer char by char, and position the cursor to next character
  10 char readInt(FILE* stream, int* integer)
  11 {
  12     *integer = 0;
  13     char curChar = fgetc(stream);
  14     int sign = (curChar == '-' ? -1 : 1);
  15     while (curChar < '0' || curChar > '9')
  16         curChar = fgetc(stream);
  17     while (curChar >= '0' && curChar <= '9')
  18     {
  19         *integer = 10 * (*integer) + (int) (curChar - '0');
  20         curChar = fgetc(stream);
  21     }
  22     (*integer) *= sign;
  23     return curChar; //separator, endline or .,e,E (if inside readReal)
  24 }
  25
  26 // Read a real number char by char, and position the cursor to next character
  27 char readReal(FILE* stream, float* real)
  28 {
  29     int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0;
  30     char curChar = readInt(stream, &integerPart);
  31     if (curChar == '.')
  32     {
  33         //need to count zeros
  34         while ((curChar = fgetc(stream)) == '0')
  35             countZeros++;
  36         if (curChar >= '1' && curChar <= '9')
  37         {
  38             ungetc(curChar, stream);
  39             curChar = readInt(stream, &fractionalPart);
  40         }
  41     }
  42     if (curChar == 'e' || curChar == 'E')
  43         curChar = readInt(stream, &exponent);
  44     *real = ( integerPart + (integerPart>0 ? 1. : -1.) * (float)fractionalPart
  45         / pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) )
  46             * pow(10,exponent);
  47     return curChar; //separator or endline
  48 }
  49
  50 // Parse a line into integer+float (ID, value)
  51 static void scan_line(FILE* ifile, char sep,
  52     int posID, int* ID, int posValue, float* value)
  53 {
  54     char curChar;
  55     int position = 1;
  56     while (1)
  57     {
  58         if (position == posID)
  59             curChar = readInt(ifile, ID);
  60         else if (position == posValue)
  61             curChar = readReal(ifile, value);
  62         else
  63             curChar = fgetc(ifile); //erase the comma (and skip field then)
  64
  65         // Continue until next separator (or line end or file end)
  66         while (!feof(ifile) && curChar != '\n' && curChar != sep)
  67             curChar = fgetc(ifile);
  68         position++;
  69
  70         if (curChar == '\n' || feof(ifile))
  71         {
  72             // Reached end of line
  73             break;
  74         }
  75     }
  76 }
  77
  78 // Main job: parse a data file into a conventional CSV file in rows, without header
  79 // Current limitations:
  80 //  - remove partial series (we could fill missing values instead)
  81 //  - consider missing fields == 0 (if ,,)
  82 //  - IDs should be st. pos. integers
  83 //  - UNIX linebreaks only (\n)
  84 int transform(const char* ifileName, int posID, int posValue,
  85     const char* ofileName, int nbItems, char sep)
  86 {
  87     uint64_t processedLines = 0; //execution trace
  88     uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
  89     int tsLength, lastID=0, ID, firstID, eof;
  90     float value, tmpVal;
  91     Vector* tsBuffer = vector_new(float);
  92     FILE* ifile = fopen(ifileName, "r");
  93     // Output file to write time-series sequentially, CSV format.
  94     FILE* ofile = fopen(ofileName, "w");
  95
  96     // Skip header
  97     char curChar;
  98     do
  99         curChar = fgetc(ifile);
 100     while (curChar != '\n');
 101
 102     // Process one client (ID in first column) at a time
 103     while (1)
 104     {
 105
 106         eof = feof(ifile);
 107         if (!eof)
 108         {
 109             // Is there anything left to read? (file may end with '\n')
 110             curChar = fgetc(ifile);
 111             if (!feof(ifile) && curChar != '\n')
 112             {
 113                 // Yes: read current line
 114                 ungetc(curChar, ifile);
 115                 scan_line(ifile, sep, posID, &ID, posValue, &value);
 116             }
 117             else
 118                 eof = 1;
 119         }
 120
 121         if (ID != lastID || eof)
 122         {
 123             if (lastID > 0)
 124             {
 125                 // Just starting a new time-series (or EOF): process the last one
 126                 if (tsLength == vector_size(tsBuffer))
 127                 {
 128                     for (int i=0; i<tsLength-1; i++)
 129                     {
 130                         vector_get(tsBuffer, i, tmpVal);
 131                         fprintf(ofile, "%g%c", tmpVal, sep);
 132                     }
 133                     vector_get(tsBuffer, tsLength-1, tmpVal);
 134                     fprintf(ofile, "%g\n", tmpVal);
 135                     seriesCount++;
 136                     if (nbItems > 0 && ++seriesCount >= nbItems)
 137                         break;
 138                 }
 139                 else
 140                 {
 141                     // Mismatch lengths: skip series
 142                     mismatchLengthCount++;
 143                 }
 144             }
 145             else
 146                 firstID = ID;
 147             if (eof)
 148             {
 149                 // Last serie is processed
 150                 break;
 151             }
 152             // Reinitialize current index of new serie
 153             tsLength = 0;
 154             lastID = ID;
 155         }
 156
 157         // Fill values buffer
 158         if (ID != firstID)
 159         {
 160             if (tsLength < vector_size(tsBuffer))
 161                 vector_set(tsBuffer, tsLength, value);
 162         }
 163         else
 164         {
 165             // First serie is reference: push all values
 166             vector_push(tsBuffer, value);
 167         }
 168         tsLength++;
 169
 170         if ((++processedLines) % 1000000 == 0)
 171             fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
 172     }
 173
 174     // finally print some statistics
 175     fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
 176     if (mismatchLengthCount > 0)
 177         fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
 178
 179     fclose(ifile);
 180     fclose(ofile);
 181     return 0;
 182 }
 183
 184 int main(int argc, char** argv)
 185 {
 186     if (argc < 4) //program name + 3 arguments
 187     {
 188         printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
 189   - ifileName: name of by-columns CSV input file\n \
 190   - posID: position of the identifier in a line (start at 1)\n \
 191   - posValue: position of the value of interest in a line\n \
 192   - ofileName: name of the output file; default: out.csv\n \
 193   - nbItems: number of series to retrieve; default: 0 (all)\n \
 194   - sep: fields separator; default: ','\n");
 195         return 0;
 196     }
 197     else
 198     {
 199         return transform(argv[1], atoi(argv[2]), atoi(argv[3]),
 200             argc > 4 ? argv[4] : "out.csv",
 201             argc > 5 ? atoi(argv[5]) : 0,
 202             argc > 6 ? argv[6][0] : ',');
 203     }
 204 }