data/preprocessing/convert_to_CSV.c

#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <errno.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
   8
   9 // Read an integer char by char, and position the cursor to next character
  10 char readInt(FILE* stream, int* integer)
  11 {
  12     *integer = 0;
  13     char curChar = fgetc(stream);
  14     int sign = (curChar == '-' ? -1 : 1);
  15     while (curChar < '0' || curChar > '9')
  16         curChar = fgetc(stream);
  17     while (curChar >= '0' && curChar <= '9')
  18     {
  19         *integer = 10 * (*integer) + (int) (curChar - '0');
  20         curChar = fgetc(stream);
  21     }
  22     (*integer) *= sign;
  23     return curChar; //separator, endline or .,e,E (if inside readReal)
  24 }
  25
  26 // Read a real number char by char, and position the cursor to next character
  27 char readReal(FILE* stream, float* real)
  28 {
  29     int integerPart, exponent = 0, fractionalPart = 0, countZeros = 0;
  30     char curChar = readInt(stream, &integerPart);
  31     if (curChar == '.')
  32     {
  33         //need to count zeros
  34         while ((curChar = fgetc(stream)) == '0')
  35             countZeros++;
  36         if (curChar >= '1' && curChar <= '9')
  37         {
  38             ungetc(curChar, stream);
  39             curChar = readInt(stream, &fractionalPart);
  40         }
  41     }
  42     if (curChar == 'e' || curChar == 'E')
  43         curChar = readInt(stream, &exponent);
  44     *real = ( integerPart + (integerPart>=0 ? 1. : -1.) * (float)fractionalPart
  45         / pow(10,countZeros+floor(log10(fractionalPart>0 ? fractionalPart : 1)+1)) )
  46             * pow(10,exponent);
  47
  48     return curChar; //separator or endline
  49 }
  50
  51 // Parse a line into integer+float (ID, value)
  52 static void scan_line(FILE* ifile, char sep,
  53     int posID, int* ID, int posValue, float* value)
  54 {
  55     char curChar;
  56     int position = 1;
  57     while (1)
  58     {
  59         if (position == posID)
  60             curChar = readInt(ifile, ID);
  61         else if (position == posValue)
  62             curChar = readReal(ifile, value);
  63         else
  64             curChar = fgetc(ifile); //erase the comma (and skip field then)
  65
  66         // Continue until next separator (or line end or file end)
  67         while (!feof(ifile) && curChar != '\n' && curChar != sep)
  68             curChar = fgetc(ifile);
  69         position++;
  70
  71         if (curChar == '\n' || feof(ifile))
  72         {
  73             // Reached end of line
  74             break;
  75         }
  76     }
  77 }
  78
  79 // Main job: parse a data file into a conventional CSV file in rows, without header
  80 // Current limitations:
  81 //  - remove partial series (we could fill missing values instead)
  82 //  - consider missing fields == 0 (if ,,)
  83 //  - IDs should be st. pos. integers
  84 //  - UNIX linebreaks only (\n)
  85 int transform(const char* ifileName, int posID, int posValue,
  86     const char* ofileName, int nbItems, char sep)
  87 {
  88     uint64_t processedLines = 0; //execution trace
  89     uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
  90     int tsLength, lastID=0, ID, firstID, eof;
  91     float value, tmpVal;
  92     Vector* tsBuffer = vector_new(float);
  93     FILE* ifile = fopen(ifileName, "r");
  94     // Output file to write time-series sequentially, CSV format.
  95     FILE* ofile = fopen(ofileName, "w");
  96
  97     // Skip header
  98     char curChar;
  99     do
 100         curChar = fgetc(ifile);
 101     while (curChar != '\n');
 102
 103     // Process one client (ID in first column) at a time
 104     while (1)
 105     {
 106
 107         eof = feof(ifile);
 108         if (!eof)
 109         {
 110             // Is there anything left to read? (file may end with '\n')
 111             curChar = fgetc(ifile);
 112             if (!feof(ifile) && curChar != '\n')
 113             {
 114                 // Yes: read current line
 115                 ungetc(curChar, ifile);
 116                 scan_line(ifile, sep, posID, &ID, posValue, &value);
 117             }
 118             else
 119                 eof = 1;
 120         }
 121
 122         if (ID != lastID || eof)
 123         {
 124             if (lastID > 0)
 125             {
 126                 // Just starting a new time-series (or EOF): process the last one
 127                 if (tsLength == vector_size(tsBuffer))
 128                 {
 129                     for (int i=0; i<tsLength-1; i++)
 130                     {
 131                         vector_get(tsBuffer, i, tmpVal);
 132                         fprintf(ofile, "%g%c", tmpVal, sep);
 133                     }
 134                     vector_get(tsBuffer, tsLength-1, tmpVal);
 135                     fprintf(ofile, "%g\n", tmpVal);
 136                     seriesCount++;
 137                     if (nbItems > 0 && ++seriesCount >= nbItems)
 138                         break;
 139                 }
 140                 else
 141                 {
 142                     // Mismatch lengths: skip series
 143                     mismatchLengthCount++;
 144                 }
 145             }
 146             else
 147                 firstID = ID;
 148             if (eof)
 149             {
 150                 // Last serie is processed
 151                 break;
 152             }
 153             // Reinitialize current index of new serie
 154             tsLength = 0;
 155             lastID = ID;
 156         }
 157
 158         // Fill values buffer
 159         if (ID != firstID)
 160         {
 161             if (tsLength < vector_size(tsBuffer))
 162                 vector_set(tsBuffer, tsLength, value);
 163         }
 164         else
 165         {
 166             // First serie is reference: push all values
 167             vector_push(tsBuffer, value);
 168         }
 169         tsLength++;
 170
 171         if ((++processedLines) % 1000000 == 0)
 172             fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
 173     }
 174
 175     // finally print some statistics
 176     fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount);
 177     if (mismatchLengthCount > 0)
 178         fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);
 179
 180     fclose(ifile);
 181     fclose(ofile);
 182     return 0;
 183 }
 184
 185 int main(int argc, char** argv)
 186 {
 187     if (argc < 4) //program name + 3 arguments
 188     {
 189         printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n \
 190   - ifileName: name of by-columns CSV input file\n \
 191   - posID: position of the identifier in a line (start at 1)\n \
 192   - posValue: position of the value of interest in a line\n \
 193   - ofileName: name of the output file; default: out.csv\n \
 194   - nbItems: number of series to retrieve; default: 0 (all)\n \
 195   - sep: fields separator; default: ','\n");
 196         return 0;
 197     }
 198     else
 199     {
 200         return transform(argv[1], atoi(argv[2]), atoi(argv[3]),
 201             argc > 4 ? argv[4] : "out.csv",
 202             argc > 5 ? atoi(argv[5]) : 0,
 203             argc > 6 ? argv[6][0] : ',');
 204     }
 205 }