#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <inttypes.h>
#include <cgds/Vector.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
| 7 | |
// Read the next decimal integer from 'stream' into '*integer'.
//
// The value is negative only if the very first character read is '-'.
// Every character preceding the first digit is skipped (and consumed),
// then consecutive digits are accumulated. If the end of the stream is
// reached before any digit is found, '*integer' is set to 0.
//
// Returns the character that terminated the number; that character has been
// consumed from the stream. At end of file the result is EOF narrowed to a
// char, so callers should rely on feof() to detect that situation.
//
// NOTE: no overflow check is done; more than 18 digits overflow an int64_t.
char readInt(FILE* stream, int64_t* integer)
{
	*integer = 0;

	// fgetc() returns an int: keep it as such so that EOF stays distinguishable
	int c = fgetc(stream);
	const int sign = (c == '-') ? -1 : 1;

	// Skip everything which is not a digit, but give up at end of file
	// (otherwise this loop would never terminate on exhausted input)
	while (c != EOF && (c < '0' || c > '9'))
		c = fgetc(stream);

	// Accumulate the digits; EOF is negative, hence it also ends this loop
	while (c >= '0' && c <= '9')
	{
		*integer = 10 * (*integer) + (int64_t)(c - '0');
		c = fgetc(stream);
	}

	*integer *= sign;
	return (char)c;
}
| 21 | |
// Read the next real number from 'stream' into '*real'.
//
// Accepted shape: [-]digits[.digits][(e|E)[-]digits]. The integer part,
// the fractional part and the exponent are each read with readInt().
//
// Returns the last terminating character reported by readInt() (or the
// character following the fractional zeros when the fraction holds only
// zeros); that character has been consumed from the stream.
//
// NOTE: the fractional digits after the leading zeros are stored in an
// int64_t, so very long fractions (more than 18 significant digits) overflow.
char readReal(FILE* stream, float* real)
{
	// Peek at the first character to learn the sign: readInt() cannot convey
	// it when the integer part is zero ("-0.5" would be read as 0 then 5).
	int first = fgetc(stream);
	const int negative = (first == '-');
	if (first != EOF)
		ungetc(first, stream);

	int64_t integerPart;
	char nextChar = readInt(stream, &integerPart);
	// Work on the magnitude; the sign is applied once at the very end
	double magnitude = (integerPart < 0) ? -(double)integerPart : (double)integerPart;

	int64_t fractionalPart = 0;
	int countZeros = 0;
	if (nextChar == '.')
	{
		// Leading zeros of the fraction would be lost by readInt(): count them
		int c;
		while ((c = fgetc(stream)) == '0')
			countZeros++;
		nextChar = (char)c;
		if (c >= '1' && c <= '9')
		{
			ungetc(c, stream);
			nextChar = readInt(stream, &fractionalPart);
		}
	}

	int64_t exponent = 0;
	if (nextChar == 'e' || nextChar == 'E')
		nextChar = readInt(stream, &exponent);

	// Number of digits after the decimal point = leading zeros + digits of
	// fractionalPart (counted exactly, without going through log10)
	int fractionDigits = countZeros;
	for (int64_t rest = fractionalPart; rest > 0; rest /= 10)
		fractionDigits++;

	double value = magnitude + (double)fractionalPart / pow(10.0, fractionDigits);
	value *= pow(10.0, (double)exponent);

	*real = (float)(negative ? -value : value);
	return nextChar;
}
| 47 | |
// Parse one line of comma-separated fields, extracting the ID and raw power.
//
// Fields are numbered from 1. 'posTime', 'posID' and 'posPower' give the
// positions of the time, ID and power columns; all other fields are skipped.
//  - '*ID' receives the integer found in the ID column (truncated to 32 bits).
//    An empty ID field is not detected.
//  - '*rawPower' receives the value of the power column, or FLT_MAX ("NA")
//    when that field does not start like a number (empty, "NA", ...).
//  - '*time' is currently left untouched (see TODO below).
//
// On return the stream is positioned at the first character following the
// line terminator(s), or at end of file.
static void scan_line(FILE* ifile,
	int posTime, uint32_t* time, int posID, uint32_t* ID, int posPower, float* rawPower)
{
	(void)time; //not filled yet

	char nextChar = '\0';
	int position = 1;
	while (1)
	{
		if (position == posTime)
		{
			//TODO: go to 01JAN2009:00:00:00 and convert to integer (0, 1, ...)
			//Until then the field is skipped like any other unused column, so
			//that the following positions stay aligned with the actual fields.
			nextChar = fgetc(ifile);
		}
		else if (position == posID)
		{
			int64_t ID_on64bits;
			nextChar = readInt(ifile, &ID_on64bits);
			*ID = (uint32_t)ID_on64bits;
		}
		else if (position == posPower)
		{
			// Only hand over to readReal() if the field looks numeric: on an empty
			// or textual field it would skip delimiters and read into the next line.
			int first = fgetc(ifile);
			if ((first >= '0' && first <= '9') || first == '-' || first == '+')
			{
				ungetc(first, ifile);
				nextChar = readReal(ifile, rawPower);
			}
			else
			{
				*rawPower = FLT_MAX; //"NA"
				nextChar = (char)first;
			}
		}
		else
			//erase the comma (and skip field then)
			nextChar = fgetc(ifile);

		//continue until next comma (or line end or file end)
		while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != ',')
			nextChar = fgetc(ifile);
		position++;

		if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
		{
			// skip all potential line feeds
			// (parentheses matter: '&&' binds tighter than '||')
			while (!feof(ifile) && (nextChar == '\n' || nextChar == '\r'))
				nextChar = fgetc(ifile);
			// give back the first character of the next line
			if (!feof(ifile))
				ungetc(nextChar, ifile);
			break;
		}
	}
}
| 92 | |
| 93 | |
//TODO: check the datetime on each line (build a datetimes file, possibly one per year?)
//Also fill NA values with the closest value in the file (easy)
//Example datetime: 01JAN2009:00:00:00 ..........
| 97 | |
| 98 | |
| 99 | // Main job: parse a data file into a conventional CSV file in rows, without header |
| 100 | void transform(const char* ifileName, int posID, int posTime, int posValue, |
| 101 | char* firstTime, char* lastTime, const char* ofileName, int nbItems) //uint32_t nbItems |
| 102 | { |
| 103 | //TODO: complete timedate vector from first_time and last_time |
| 104 | // --> this gives (expected) tsLength for free |
| 105 | |
| 106 | FILE* ifile = fopen(ifileName, "r"); |
| 107 | // output file to write time-series sequentially, CSV format. |
| 108 | FILE* ofile = fopen(ofileName, "w"); |
| 109 | |
| 110 | // Skip header |
| 111 | char nextChar; |
| 112 | do |
| 113 | nextChar = fgetc(ifile); |
| 114 | while (!feof(ifile) && nextChar != '\n' && nextChar != '\r') |
| 115 | |
| 116 | // process one client (ID in first column) at a time |
| 117 | uint64_t processedLines = 0; //execution trace |
| 118 | uint32_t seriesCount=0, skippedSeriesCount=0, tsLength=0; |
| 119 | uint32_t mismatchLengthCount=0; |
| 120 | float tsBuffer[refTsLength]; |
| 121 | lastID = 0; |
| 122 | while (!feof(ifile)) |
| 123 | { |
| 124 | // next element to read always start with a digit |
| 125 | do |
| 126 | curChar = fgetc(ifile); |
| 127 | while (!feof(ifile) && (curChar < '0' || curChar > '9')); |
| 128 | if (feof(ifile)) |
| 129 | break; |
| 130 | ungetc(curChar, ifile); |
| 131 | |
| 132 | // read line |
| 133 | scan_line(ifile, posID, &ID, posPower, &rawPower); |
| 134 | if (ID != lastID) |
| 135 | { |
| 136 | //just starting a new time-series: must process the last one (if there is a last one !) |
| 137 | if (lastID > 0) |
| 138 | { |
| 139 | if (tsLength == refTsLength) |
| 140 | { |
| 141 | for (int i=0; i<tsLength; i++) |
| 142 | { |
| 143 | char* format = i<tsLength-1 ? "%g," : "%g"; |
| 144 | fprintf(ofile, format, tsBuffer[i]); |
| 145 | } |
| 146 | fprintf(ofile, "\n"); |
| 147 | if (nbItems > 0 && ++seriesCount >= nbItems) |
| 148 | break; |
| 149 | } |
| 150 | //if something wrong happened, skip series |
| 151 | else |
| 152 | { |
| 153 | skippedSeriesCount++; |
| 154 | if (tsLength != refTsLength) |
| 155 | mismatchLengthCount++; |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | // reinitialize flags |
| 160 | tsLength = 0; |
| 161 | lastID = ID; |
| 162 | } |
| 163 | |
| 164 | //We cannot write more than refTsLength values |
| 165 | if (tsLength < refTsLength) |
| 166 | tsBuffer[tsLength++] = rawPower; |
| 167 | |
| 168 | if ((++processedLines) % 1000000 == 0) |
| 169 | fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines); |
| 170 | } |
| 171 | |
| 172 | if (tsLength == refTsLength && (nbItems <= 0 || seriesCount < nbItems)) |
| 173 | { |
| 174 | // flush last time-series if all conditions are met |
| 175 | for (int i=0; i<tsLength; i++) |
| 176 | { |
| 177 | char* format = i<tsLength-1 ? "%g," : "%g"; |
| 178 | fprintf(ofile, format, tsBuffer[i]); |
| 179 | } |
| 180 | fprintf(ofile, "\n"); |
| 181 | seriesCount++; |
| 182 | } |
| 183 | else if (nbItems <= 0 || seriesCount < nbItems) |
| 184 | { |
| 185 | if (tsLength != refTsLength) |
| 186 | mismatchLengthCount++; |
| 187 | } |
| 188 | |
| 189 | // finally print some statistics |
| 190 | if (seriesCount < nbItems) |
| 191 | fprintf(stdout,"Warning: only %u series retrieved.\n",seriesCount); |
| 192 | fprintf(stdout,"%u mismatch series lengths.\n",mismatchLengthCount); |
| 193 | |
| 194 | fclose(ifile); |
| 195 | fclose(ofile); |
| 196 | } |
| 197 | |
// Entry point. Expected arguments, in order:
//   ifile posID posTime posValue firstTime lastTime ofile nbItems
// They are forwarded to transform() in that same order.
// Returns 1 (after printing the usage on stderr) if arguments are missing.
int main(int argc, char** argv)
{
	// 8 arguments are read below, plus the program name
	if (argc < 9)
	{
		fprintf(stderr,
			"Usage: %s ifile posID posTime posValue firstTime lastTime ofile nbItems\n",
			argc > 0 ? argv[0] : "transform");
		return 1;
	}

	transform(argv[1], atoi(argv[2]), atoi(argv[3]), atoi(argv[4]),
		argv[5], argv[6], argv[7], atoi(argv[8]));
	return 0;
}