#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <errno.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
| 8 | |
// Read an integer char by char, and position the cursor to next character.
//  - stream: input stream, read with fgetc()
//  - integer: output; set to 0 when no digit is found before end of file
// The number is negative only if the very first character read is '-'.
// Any non-digit characters before the first digit are skipped.
// Returns the first character following the digits: separator, endline or
// .,e,E (if inside readReal); at end of file, EOF converted to char.
char readInt(FILE* stream, int* integer)
{
	*integer = 0;
	// fgetc() returns an int: keep it as such so EOF is never mistaken for a character
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);
	// Skip everything up to the first digit; stop at EOF instead of looping forever
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;
	return (char) curChar;
}
| 25 | |
// Read a real number char by char, and position the cursor to next character.
//  - stream: input stream
//  - real: output; integer part, optional ".digits" fraction and optional
//    e/E exponent combined into a float (0 if the stream is already at EOF)
// The integer part and the exponent are parsed by readInt().
// Returns the character following the number (separator or endline).
char readReal(FILE* stream, float* real)
{
	// Peek at the first character: readInt() cannot report the sign when the
	// integer part is 0 (e.g. "-0.5"), so remember it here.
	int first = fgetc(stream);
	if (first == EOF)
	{
		*real = 0.0f;
		return (char) EOF;
	}
	ungetc(first, stream);
	int negative = (first == '-');

	int integerPart = 0, exponent = 0;
	char curChar = readInt(stream, &integerPart);

	// Fractional digits are accumulated in a double (an int would overflow on
	// long fractions); leading zeros are accounted for through fracLength.
	double fracDigits = 0.0;
	int fracLength = 0;
	if (curChar == '.')
	{
		int c = fgetc(stream);
		while (c >= '0' && c <= '9')
		{
			// Digits past the 18th cannot change a float: consume but ignore them
			if (fracLength < 18)
			{
				fracDigits = 10.0 * fracDigits + (c - '0');
				fracLength++;
			}
			c = fgetc(stream);
		}
		curChar = (char) c;
	}
	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	double magnitude = fabs((double) integerPart) + fracDigits / pow(10, fracLength);
	*real = (float) ((negative ? -magnitude : magnitude) * pow(10, exponent));

	return curChar; //separator or endline
}
| 50 | |
// Consume one line of ifile, field by field (fields are numbered from 1).
// The field at index posID is read with readInt() into *ID, the field at
// index posValue with readReal() into *value (posID wins if both are equal);
// every other field is skipped. Stops after the line's '\n' or at end of file.
static void scan_line(FILE* ifile, char sep,
	int posID, int* ID, int posValue, float* value)
{
	for (int field = 1; ; field++)
	{
		char last;
		if (field == posID)
			last = readInt(ifile, ID);
		else if (field == posValue)
			last = readReal(ifile, value);
		else
			last = fgetc(ifile); //first char of an ignored field (or its separator if empty)

		// Discard the remainder of the field, up to separator, line end or file end
		while (!(feof(ifile) || last == '\n' || last == sep))
			last = fgetc(ifile);

		// Line (or file) exhausted: nothing more to parse
		if (last == '\n' || feof(ifile))
			return;
	}
}
| 78 | |
| 79 | // Main job: parse a data file into a conventional CSV file in rows, without header |
| 80 | // Current limitations: |
| 81 | // - remove partial series (we could fill missing values instead) |
| 82 | // - consider missing fields == 0 (if ,,) |
| 83 | // - IDs should be st. pos. integers |
| 84 | // - UNIX linebreaks only (\n) |
| 85 | int transform(const char* ifileName, int posID, int posValue, |
| 86 | const char* ofileName, int nbItems, char sep) |
| 87 | { |
| 88 | uint64_t processedLines = 0; //execution trace |
| 89 | uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0; |
| 90 | int tsLength, lastID=0, ID, firstID, eof; |
| 91 | float value, tmpVal; |
| 92 | Vector* tsBuffer = vector_new(float); |
| 93 | FILE* ifile = fopen(ifileName, "r"); |
| 94 | // Output file to write time-series sequentially, CSV format. |
| 95 | FILE* ofile = fopen(ofileName, "w"); |
| 96 | |
| 97 | // Skip header |
| 98 | char curChar; |
| 99 | do |
| 100 | curChar = fgetc(ifile); |
| 101 | while (curChar != '\n'); |
| 102 | |
| 103 | // Process one client (ID in first column) at a time |
| 104 | while (1) |
| 105 | { |
| 106 | |
| 107 | eof = feof(ifile); |
| 108 | if (!eof) |
| 109 | { |
| 110 | // Is there anything left to read? (file may end with '\n') |
| 111 | curChar = fgetc(ifile); |
| 112 | if (!feof(ifile) && curChar != '\n') |
| 113 | { |
| 114 | // Yes: read current line |
| 115 | ungetc(curChar, ifile); |
| 116 | scan_line(ifile, sep, posID, &ID, posValue, &value); |
| 117 | } |
| 118 | else |
| 119 | eof = 1; |
| 120 | } |
| 121 | |
| 122 | if (ID != lastID || eof) |
| 123 | { |
| 124 | if (lastID > 0) |
| 125 | { |
| 126 | // Just starting a new time-series (or EOF): process the last one |
| 127 | if (tsLength == vector_size(tsBuffer)) |
| 128 | { |
| 129 | for (int i=0; i<tsLength-1; i++) |
| 130 | { |
| 131 | vector_get(tsBuffer, i, tmpVal); |
| 132 | fprintf(ofile, "%g%c", tmpVal, sep); |
| 133 | } |
| 134 | vector_get(tsBuffer, tsLength-1, tmpVal); |
| 135 | fprintf(ofile, "%g\n", tmpVal); |
| 136 | seriesCount++; |
| 137 | if (nbItems > 0 && ++seriesCount >= nbItems) |
| 138 | break; |
| 139 | } |
| 140 | else |
| 141 | { |
| 142 | // Mismatch lengths: skip series |
| 143 | mismatchLengthCount++; |
| 144 | } |
| 145 | } |
| 146 | else |
| 147 | firstID = ID; |
| 148 | if (eof) |
| 149 | { |
| 150 | // Last serie is processed |
| 151 | break; |
| 152 | } |
| 153 | // Reinitialize current index of new serie |
| 154 | tsLength = 0; |
| 155 | lastID = ID; |
| 156 | } |
| 157 | |
| 158 | // Fill values buffer |
| 159 | if (ID != firstID) |
| 160 | { |
| 161 | if (tsLength < vector_size(tsBuffer)) |
| 162 | vector_set(tsBuffer, tsLength, value); |
| 163 | } |
| 164 | else |
| 165 | { |
| 166 | // First serie is reference: push all values |
| 167 | vector_push(tsBuffer, value); |
| 168 | } |
| 169 | tsLength++; |
| 170 | |
| 171 | if ((++processedLines) % 1000000 == 0) |
| 172 | fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); |
| 173 | } |
| 174 | |
| 175 | // finally print some statistics |
| 176 | fprintf(stdout,"NOTE: %u series retrieved.\n",seriesCount); |
| 177 | if (mismatchLengthCount > 0) |
| 178 | fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount); |
| 179 | |
| 180 | fclose(ifile); |
| 181 | fclose(ofile); |
| 182 | return 0; |
| 183 | } |
| 184 | |
// Convert a whole command-line argument to an int.
// Returns 1 and stores the value in *result on success; returns 0 if arg is
// not entirely a base-10 integer or does not fit in an int.
static int parse_int_arg(const char* arg, int* result)
{
	char* end;
	errno = 0;
	long parsed = strtol(arg, &end, 10);
	if (end == arg || *end != '\0' || errno == ERANGE
		|| parsed < INT_MIN || parsed > INT_MAX)
	{
		return 0;
	}
	*result = (int) parsed;
	return 1;
}

// Entry point: validate the command-line arguments, then run transform().
// Prints the usage (and returns 0, as before) when fewer than 3 arguments are
// given; returns 1 on invalid arguments; otherwise returns transform()'s result.
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			" - ifileName: name of by-columns CSV input file\n"
			" - posID: position of the identifier in a line (start at 1)\n"
			" - posValue: position of the value of interest in a line\n"
			" - ofileName: name of the output file; default: out.csv\n"
			" - nbItems: number of series to retrieve; default: 0 (all)\n"
			" - sep: fields separator; default: ','\n");
		return 0;
	}

	// Positions start at 1 and must differ: when they are equal the field is
	// read as the ID only, and no value would ever be parsed.
	int posID = 0, posValue = 0;
	if (!parse_int_arg(argv[2], &posID) || !parse_int_arg(argv[3], &posValue)
		|| posID < 1 || posValue < 1 || posID == posValue)
	{
		fprintf(stderr, "Error: posID and posValue must be distinct integers >= 1\n");
		return 1;
	}

	int nbItems = 0;
	if (argc > 5 && !parse_int_arg(argv[5], &nbItems))
	{
		fprintf(stderr, "Error: nbItems must be an integer\n");
		return 1;
	}

	char sep = ',';
	if (argc > 6)
	{
		// An empty argument would make '\0' the separator
		if (argv[6][0] == '\0')
		{
			fprintf(stderr, "Error: sep must be a non-empty character\n");
			return 1;
		}
		sep = argv[6][0];
	}

	return transform(argv[1], posID, posValue,
		argc > 4 ? argv[4] : "out.csv", nbItems, sep);
}