#define __STDC_FORMAT_MACROS //to print 64bits unsigned integers
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Non-zero if c is an ASCII decimal digit
static int is_digit(int c)
{
	return c >= '0' && c <= '9';
}

// Non-zero if c terminates a line (line feed, carriage return or end of file)
static int is_line_end(int c)
{
	return c == EOF || c == '\n' || c == '\r';
}

// Read a decimal integer, never looking past 'stop', a line end or the end of file.
// Leading non-digit characters are skipped; a '-' directly before the first digit
// makes the number negative. If no digit is found, *integer is 0.
// Values that do not fit in an int saturate at INT_MAX (resp. -INT_MAX).
// Returns the (already consumed) character following the number, or EOF.
// Pass stop == EOF to disable the separator check.
static int read_int_bounded(FILE* stream, int* integer, int stop)
{
	*integer = 0;
	int previous = 0;
	int c = fgetc(stream);
	while (!is_line_end(c) && c != stop && !is_digit(c))
	{
		previous = c;
		c = fgetc(stream);
	}
	int negative = (previous == '-');

	int magnitude = 0;
	while (is_digit(c))
	{
		int digit = c - '0';
		if (magnitude > (INT_MAX - digit) / 10)
			magnitude = INT_MAX; //saturate instead of overflowing (undefined behavior)
		else
			magnitude = 10 * magnitude + digit;
		c = fgetc(stream);
	}

	*integer = negative ? -magnitude : magnitude;
	return c;
}

// Read a real number ([-]digits[.digits][e|E[+|-]digits]) with the same bounds
// as read_int_bounded(). If no number is found, *real is 0.
// Returns the (already consumed) character following the number, or EOF.
static int read_real_bounded(FILE* stream, float* real, int stop)
{
	*real = 0.f;
	int previous = 0;
	int c = fgetc(stream);
	while (!is_line_end(c) && c != stop && !is_digit(c) && c != '.')
	{
		previous = c;
		c = fgetc(stream);
	}
	//the sign is kept apart from the digits, so that "0.5" and "-0.5" are both handled
	int negative = (previous == '-');

	// Integer part
	double magnitude = 0.;
	while (is_digit(c))
	{
		magnitude = 10. * magnitude + (c - '0');
		c = fgetc(stream);
	}

	// Fractional part
	if (c == '.')
	{
		double fraction = 0., divisor = 1.;
		int keptDigits = 0;
		while (is_digit(c = fgetc(stream)))
		{
			//digits beyond double precision are consumed but ignored
			if (keptDigits < 17)
			{
				fraction = 10. * fraction + (c - '0');
				divisor *= 10.;
				keptDigits++;
			}
		}
		magnitude += fraction / divisor;
	}

	// Exponent
	if (c == 'e' || c == 'E')
	{
		int exponent = 0;
		c = read_int_bounded(stream, &exponent, stop);
		if (magnitude > 0. && exponent != 0)
		{
			//beyond 10^350 a double is infinite anyway: no need to loop further
			int steps = exponent > 0 ? exponent : -exponent;
			if (steps > 350)
				steps = 350;
			double power = 1.;
			for (int i = 0; i < steps; i++)
				power *= 10.;
			magnitude = exponent > 0 ? magnitude * power : magnitude / power;
		}
	}

	//converting an out-of-range double to float is undefined: clamp first
	if (magnitude > FLT_MAX)
		magnitude = FLT_MAX;
	*real = (float) (negative ? -magnitude : magnitude);
	return c;
}

// Read an integer char by char, and position the cursor to next character.
// Leading non-digit characters are skipped, but never past the end of the current
// line or of the file (in which case *integer is set to 0).
// Returns the character following the number, which has been consumed.
// At end of file the returned value is EOF converted to char: use feof() to detect it.
char readInt(FILE* stream, int* integer)
{
	return (char) read_int_bounded(stream, integer, EOF);
}

// Read a real number char by char, and position the cursor to next character.
// Accepted form: [-]digits[.digits][e|E[+|-]digits]; same skipping rules and
// return value as readInt(). If no number is found, *real is set to 0.
char readReal(FILE* stream, float* real)
{
	return (char) read_real_bounded(stream, real, EOF);
}

// Parse a line into integer+float (ID, value)
// Fields are numbered from 1 and separated by 'sep'. An empty or non-numeric
// field at posID (resp. posValue) yields 0. If the line has fewer fields than
// posID (resp. posValue), *ID (resp. *value) is left untouched.
// On return the cursor is on the first character of the next non-empty line.
static void scan_line(FILE* ifile, char sep, int posID, int* ID, int posValue, float* value)
{
	//fgetc() returns characters as unsigned char values
	const int stop = (unsigned char) sep;
	int position = 1;
	for (;;)
	{
		int c;
		if (position == posID)
			c = read_int_bounded(ifile, ID, stop);
		else if (position == posValue)
			c = read_real_bounded(ifile, value, stop);
		else
			c = fgetc(ifile); //first character of a field to skip

		// Continue until next separator (or line end or file end)
		while (!is_line_end(c) && c != stop)
			c = fgetc(ifile);
		position++;

		if (is_line_end(c))
		{
			// Skip all potential line feeds
			while (c == '\n' || c == '\r')
				c = fgetc(ifile);
			if (c != EOF)
				ungetc(c, ifile);
			break;
		}
	}
}

// Write one series as a CSV row, if its length matches the reference length.
// The first series written defines the reference (*refLength == 0 means "unknown").
// Returns 1 if the row was written, 0 if it was skipped (length mismatch).
// NOTE(review): the row format ("%g" values separated by commas) was reconstructed
// because the original output loop was lost -- confirm against consumers of the file.
static int write_series(FILE* ofile, const float* values, int length, int* refLength)
{
	if (*refLength == 0)
		*refLength = length; //first serie is considered clean
	if (length != *refLength)
		return 0;
	for (int i = 0; i < length; i++)
		fprintf(ofile, i + 1 < length ? "%g," : "%g\n", values[i]);
	return 1;
}

// Main job: parse a data file into a conventional CSV file in rows, without header
// Input: one header line, then one (ID, value) observation per line; consecutive
// lines sharing the same ID form one series, written as one output row.
//  - ifileName, ofileName: input and output file names
//  - posID, posValue: positions (from 1) of the ID and value fields in a line
//  - nbItems: maximum number of series to write; <= 0 means all
//  - sep: fields separator in the input file
// Returns 0 on success, 1 if a file cannot be opened or written, or if memory runs out.
// Current limitations:
//  - remove partial series (we could fill missing values instead)
//  - series longer than the first one are truncated to its length
//  - consider missing fields == 0
//  - IDs should be integers
int transform(const char* ifileName, int posID, int posValue,
	const char* ofileName, int nbItems, char sep)
{
	FILE* ifile = fopen(ifileName, "r");
	if (ifile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open input file '%s'\n", ifileName);
		return 1;
	}
	// Output file to write time-series sequentially, CSV format.
	FILE* ofile = fopen(ofileName, "w");
	if (ofile == NULL)
	{
		fprintf(stderr, "ERROR: cannot open output file '%s'\n", ofileName);
		fclose(ifile);
		return 1;
	}

	// Skip header
	int curChar;
	do
		curChar = fgetc(ifile);
	while (!is_line_end(curChar));

	// Process one client (ID in first column) at a time
	uint64_t processedLines = 0; //execution trace
	uint32_t seriesCount = 0;
	unsigned int mismatchLengthCount = 0;
	int tsLength = 0, refTsLength = 0, lastID = 0, ID = 0;
	int haveSeries = 0; //becomes 1 once a first data line has been read
	int status = 0;
	float value = 0.f;
	float* tsBuffer = NULL; //grows while reading the first series, fixed afterwards
	size_t capacity = 0;

	for (;;)
	{
		// Go to next line; a fresh character is read at every iteration, so that
		// at most one character is ever pushed back on the stream
		curChar = fgetc(ifile);
		while (curChar == '\n' || curChar == '\r')
			curChar = fgetc(ifile);
		if (curChar == EOF)
			break;
		ungetc(curChar, ifile);

		// Read current line
		scan_line(ifile, sep, posID, &ID, posValue, &value);

		if (!haveSeries || ID != lastID)
		{
			// Just starting a new time-series: must process the last one (if exists !)
			if (haveSeries)
			{
				if (write_series(ofile, tsBuffer, tsLength, &refTsLength))
				{
					if (nbItems > 0 && ++seriesCount >= (uint32_t) nbItems)
						break;
				}
				// Mismatch lengths: skip series
				else
					mismatchLengthCount++;
			}

			// reinitialize flags
			haveSeries = 1;
			tsLength = 0;
			lastID = ID;
		}

		if (refTsLength == 0)
		{
			// First series: its length is not known yet, grow the buffer on demand
			if ((size_t) tsLength == capacity)
			{
				size_t newCapacity = (capacity > 0 ? 2 * capacity : 1024);
				float* grown = realloc(tsBuffer, newCapacity * sizeof *grown);
				if (grown == NULL)
				{
					fprintf(stderr, "ERROR: out of memory\n");
					status = 1;
					break;
				}
				tsBuffer = grown;
				capacity = newCapacity;
			}
			tsBuffer[tsLength++] = value;
		}
		//We cannot write more than refTsLength values
		else if (tsLength < refTsLength)
			tsBuffer[tsLength++] = value;

		if ((++processedLines) % 1000000 == 0)
			fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
	}

	if (status == 0 && haveSeries && (nbItems <= 0 || seriesCount < (uint32_t) nbItems))
	{
		// flush last time-series if all conditions are met
		if (!write_series(ofile, tsBuffer, tsLength, &refTsLength))
			mismatchLengthCount++;
	}

	if (mismatchLengthCount > 0)
		fprintf(stdout,"WARNING: %u mismatch series lengths.\n",mismatchLengthCount);

	free(tsBuffer);
	fclose(ifile);
	if (ferror(ofile))
		status = 1;
	//fclose() flushes: a failure here means the output file is incomplete
	if (fclose(ofile) != 0)
		status = 1;
	if (status != 0)
		fprintf(stderr, "ERROR: conversion to '%s' failed\n", ofileName);
	return status;
}

// Command-line entry point. Define TRANSFORM_NO_MAIN at compile time to leave it
// out, e.g. to link the functions above into a test harness or another program.
#ifndef TRANSFORM_NO_MAIN
int main(int argc, char** argv)
{
	if (argc < 4) //program name + 3 arguments
	{
		printf("Usage: transform ifileName posID posValue [ofileName [nbItems [sep]]]\n"
			" - ifileName: name of by-columns CSV input file\n"
			" - posID: position of the identifier in a line (start at 1)\n"
			" - posValue: position of the value of interest in a line\n"
			" - ofileName: name of the output file; default: out.csv\n"
			" - nbItems: number of series to retrieve; default: 0 (all)\n"
			" - sep: fields separator; default: ','\n");
		return 0;
	}
	else
	{
		return transform(argv[1], atoi(argv[2]), atoi(argv[3]),
			argc > 4 ? argv[4] : "out.csv",
			argc > 5 ? atoi(argv[5]) : 0,
			argc > 6 ? argv[6][0] : ',');
	}
}
#endif