/*
 * Read a decimal integer from `stream`, one character at a time.
 *
 * Every character that is not a digit is skipped until the first digit is
 * found; the run of consecutive digits is then accumulated into *integer.
 * The value is negated only when the very first character read is '-'.
 * If the end of input is reached before any digit, *integer is left at 0.
 *
 * stream:  input stream, read with fgetc()
 * integer: output; receives the parsed value
 *
 * Returns the character that follows the last digit (separator, end of line,
 * or '.', 'e', 'E' when called from readReal), or (char) EOF when the end of
 * input was hit. Callers should use feof() to tell the two apart.
 *
 * NOTE: no overflow check is done while accumulating digits.
 */
char readInt(FILE* stream, int* integer)
{
	*integer = 0;

	// fgetc() returns an int: keep it as such so EOF cannot be mistaken for data
	int curChar = fgetc(stream);
	int sign = (curChar == '-' ? -1 : 1);

	// Skip everything up to the first digit, but give up at end of input
	// (without the EOF test this loop never terminates on a digit-less stream)
	while (curChar != EOF && (curChar < '0' || curChar > '9'))
		curChar = fgetc(stream);

	// Accumulate the digits; curChar ends up on the first non-digit
	while (curChar >= '0' && curChar <= '9')
	{
		*integer = 10 * (*integer) + (curChar - '0');
		curChar = fgetc(stream);
	}
	(*integer) *= sign;

	return (char) curChar;
}
// Read a real number char by char, and position the cursor to next character
//
// Accepted shape: [-]digits[.digits][(e|E)exponent]. The integer part and the
// exponent are parsed by readInt(); the fractional digits are read here.
//
// stream: input stream, read with fgetc()
// real:   output; receives the parsed value, computed in double then narrowed
//
// Returns the character following the number (separator or end of line), or
// (char) EOF if the end of input was reached.
char readReal(FILE* stream, float* real)
{
	// Peek at the first character to get the sign: readInt() returns 0 for
	// both "0" and "-0", so the sign cannot be deduced from the integer part
	// (deducing it that way turned "0.5" into -0.5).
	int curChar = fgetc(stream);
	int negative = (curChar == '-');
	if (curChar != EOF)
		ungetc(curChar, stream);

	int integerPart, exponent = 0;
	curChar = readInt(stream, &integerPart);

	// Fractional digits are accumulated as a double (leading zeros included),
	// which avoids overflowing an int on long fractions.
	double fraction = 0., scale = 1.;
	if (curChar == '.')
	{
		while ((curChar = fgetc(stream)) >= '0' && curChar <= '9')
		{
			// Digits beyond double precision are consumed but ignored
			if (scale < 1e18)
			{
				fraction = 10. * fraction + (curChar - '0');
				scale *= 10.;
			}
		}
		fraction /= scale;
	}

	if (curChar == 'e' || curChar == 'E')
		curChar = readInt(stream, &exponent);

	// Combine magnitude and sign, then apply the power of ten
	double magnitude =
		(integerPart < 0 ? -(double) integerPart : (double) integerPart) + fraction;
	*real = (float) ((negative ? -magnitude : magnitude) * pow(10, exponent));

	return (char) curChar;
}
// Parse a line into integer+float (ID, value)
static void scan_line(FILE* ifile, char sep,
int posID, int* ID, int posValue, float* value)
{
- char nextChar;
+ char curChar;
int position = 1;
while (1)
{
if (position == posID)
- nextChar = readInt(ifile, ID);
+ curChar = readInt(ifile, ID);
else if (position == posValue)
- nextChar = readReal(ifile, value);
+ curChar = readReal(ifile, value);
else
- nextChar = fgetc(ifile); //erase the comma (and skip field then)
+ curChar = fgetc(ifile); //erase the comma (and skip field then)
// Continue until next separator (or line end or file end)
- while (!feof(ifile) && nextChar != '\n' && nextChar != '\r' && nextChar != sep)
- nextChar = fgetc(ifile);
+ while (!feof(ifile) && curChar != '\n' && curChar != sep)
+ curChar = fgetc(ifile);
position++;
- if (feof(ifile) || nextChar == '\n' || nextChar == '\r')
+ if (curChar == '\n' || feof(ifile))
{
- // Skip all potential line feeds
- while (!feof(ifile) && nextChar == '\n' || nextChar == '\r')
- nextChar = fgetc(ifile);
- if (!feof(ifile))
- ungetc(nextChar, ifile);
+ // Reached end of line
break;
}
}
// Main job: parse a data file into a conventional CSV file in rows, without header
// Current limitations:
// - remove partial series (we could fill missing values instead)
-// - consider missing fields == 0
-// - IDs should be integers
+// - consider missing fields == 0 (if ,,)
+// - IDs should be strictly positive integers
+// - UNIX linebreaks only (\n)
int transform(const char* ifileName, int posID, int posValue,
const char* ofileName, int nbItems, char sep)
{
+ uint64_t processedLines = 0; //execution trace
+ uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
+ int tsLength, lastID=0, ID, firstID, eof;
+ float value, tmpVal;
+ Vector* tsBuffer = vector_new(float);
FILE* ifile = fopen(ifileName, "r");
// Output file to write time-series sequentially, CSV format.
FILE* ofile = fopen(ofileName, "w");
char curChar;
do
curChar = fgetc(ifile);
- while (!feof(ifile) && curChar != '\n' && curChar != '\r');
+ while (curChar != '\n');
// Process one client (ID in first column) at a time
- uint64_t processedLines = 0; //execution trace
- uint32_t seriesCount=0, skippedSeriesCount=0, mismatchLengthCount=0;
- int tsLength=0, refTsLength=0, lastID=0, ID=0;
- float value=0., tsBuffer[refTsLength];
- while (!feof(ifile))
+ while (1)
{
- // Go to next line
- while (!feof(ifile) && (curChar == '\n' || curChar == '\r'))
- curChar = fgetc(ifile);
- if (feof(ifile))
- break;
- ungetc(curChar, ifile);
- // Read current line
- scan_line(ifile, sep, posID, &ID, posValue, &value);
+ eof = feof(ifile);
+ if (!eof)
+ {
+ // Is there anything left to read? (file may end with '\n')
+ curChar = fgetc(ifile);
+ if (!feof(ifile) && curChar != '\n')
+ {
+ // Yes: read current line
+ ungetc(curChar, ifile);
+ scan_line(ifile, sep, posID, &ID, posValue, &value);
+ }
+ else
+ eof = 1;
+ }
- printf("SCAN: %i %g\n",ID,value);
- if (ID != lastID)
+ if (ID != lastID || eof)
{
- // Just starting a new time-series: must process the last one (if exists !)
if (lastID > 0)
{
- if (tsLength == refTsLength)
+ // Just starting a new time-series (or EOF): process the last one
+ if (tsLength == vector_size(tsBuffer))
{
- for (int i=0; i<tsLength; i++)
+ for (int i=0; i<tsLength-1; i++)
{
- char* format = i<tsLength-1 ? "%g%c" : "%g";
- fprintf(ofile, format, tsBuffer[i], sep);
+ vector_get(tsBuffer, i, tmpVal);
+ fprintf(ofile, "%g%c", tmpVal, sep);
}
- fprintf(ofile, "\n");
+ vector_get(tsBuffer, tsLength-1, tmpVal);
+ fprintf(ofile, "%g\n", tmpVal);
+ seriesCount++;
if (nbItems > 0 && ++seriesCount >= nbItems)
break;
}
- // Mismatch lengths: skip series
else
+ {
+ // Mismatch lengths: skip series
mismatchLengthCount++;
+ }
}
else
- refTsLength = tsLength; //first serie is considered clean
-
- // reinitialize flags
+ firstID = ID;
+ if (eof)
+ {
+ // Last serie is processed
+ break;
+ }
+ // Reinitialize current index of new serie
tsLength = 0;
lastID = ID;
}
-printf("LA %i %i\n",tsLength,refTsLength);
- //We cannot write more than refTsLength values
- if (tsLength < refTsLength)
- tsBuffer[tsLength++] = value;
-
- if ((++processedLines) % 1000000 == 0)
- fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
- }
- if (nbItems <= 0 || seriesCount < nbItems)
- {
- // flush last time-series if all conditions are met
- if (tsLength == refTsLength)
+ // Fill values buffer
+ if (ID != firstID)
{
- for (int i=0; i<tsLength; i++)
- {
- char* format = i<tsLength-1 ? "%g%c" : "%g";
- fprintf(ofile, format, tsBuffer[i], sep);
- }
- fprintf(ofile, "\n");
- seriesCount++;
+ if (tsLength < vector_size(tsBuffer))
+ vector_set(tsBuffer, tsLength, value);
}
else
- mismatchLengthCount++;
+ {
+ // First serie is reference: push all values
+ vector_push(tsBuffer, value);
+ }
+ tsLength++;
+
+ if ((++processedLines) % 1000000 == 0)
+ fprintf(stdout,"Processed %"PRIu64" lines\n", processedLines);
}
// finally print some statistics