+++ /dev/null
-#include "MPI_Main/master.h"
-#include "MPI_Main/slave.h"
-#include "Util/utils.h"
-#include "Util/rng.h"
-#include <sys/stat.h>
-#include <mpi.h>
-#include <math.h>
-#include <stdio.h>
-#include "TimeSeries/serialize.h"
-#include "TimeSeries/deserialize.h"
-#include "Classification/getClass.h"
-#include <string.h>
-#include <cgds/Vector.h>
-#include <libxml/xmlreader.h>
-
-// serialize text file argv[1] into a binary file argv[2]
-int serialize_main(int argc, char** argv)
-{
- const char* ifileName = argv[1];
- const char* ofileName = argv[2];
- int byCols = atoi(argv[3]);
- uint32_t nbItems = atoi(argv[4]); //==0 for "all series"
-
- if (byCols)
- serialize_byCols(ifileName, ofileName, nbItems);
- else
- serialize_byRows(ifileName, ofileName, nbItems);
- return 0;
-}
-
-// deserialize binary file argv[1] into text file argv[2]
-int deserialize_main(int argc, char** argv)
-{
- const char* ifileName = argv[1];
- const char* ofileName = argv[2];
- Vector* vranks = vector_new(uint32_t);
- //each token is at most two ints (a-b = from a to b included)
- char* token = strtok(argv[3], ",");
- int retrieveAll = 0;
- uint32_t* ranks = NULL;
- while (token)
- {
- //scan token to find middle position of '-' (if any)
- int minusPos = -1;
- int tokenLength = strlen(token);
- //loop starts at index 1 because -N is allowed (and means 'everything')
- for (int i=1; i<tokenLength; i++)
- {
- if (token[i] == '-')
- {
- minusPos = i;
- break;
- }
- }
- if (minusPos < 0)
- {
- int64_t rank = (int64_t)atoi(token);
- if (rank <= 0)
- {
- retrieveAll = 1;
- break;
- }
- vector_push(vranks, (uint32_t)rank);
- }
- else
- {
- token[minusPos] = 0;
- int int1 = atoi(token);
- int int2 = atoi(token+minusPos+1);
- for (uint32_t i=int1; i<=int2; i++)
- vector_push(vranks, i);
- }
- token = strtok(NULL, ",");
- }
- uint32_t nbRanks = retrieveAll
- ? 0
- : vector_size(vranks);
- if (!retrieveAll)
- {
- ranks = (uint32_t*) malloc(nbRanks*sizeof(uint32_t));
- for (uint32_t i=0; i<nbRanks; i++)
- {
- vector_get(vranks, i, ranks[i]);
- ranks[i]--; //re-express on 0...{n-1}
- }
- }
- vector_destroy(vranks);
-
- deserialize(ifileName, ofileName, ranks, nbRanks);
- return 0;
-}
-
-//main clustering task (master or slave)
-int cluster_main(int argc, char **argv)
-{
- MPI_Init(&argc, &argv);
-
- char* ifileName = argv[1]; //could be "../data/test.bin"
- uint32_t nbSeriesInChunk = atoi(argv[2]); //could be 3000
- uint32_t nbClusters = atoi(argv[3]); //could be 15
- int randomize = atoi(argv[4]); //boolean
- uint32_t p_for_dissims = atoi(argv[5]); //1 for L1, 2 for L2, ...etc
-
- // Get totalNbSeries and tsLength
- uint32_t totalNbSeries = get_nbSeries(ifileName);
- uint32_t tsLength = get_tsLength(ifileName);
-
- // Basic sanity checks
- if (nbClusters <= 0 || nbSeriesInChunk <= 1)
- {
- MPI_Finalize();
- return 0;
- }
- if (nbSeriesInChunk > totalNbSeries)
- nbSeriesInChunk = totalNbSeries;
- if (nbClusters > nbSeriesInChunk)
- nbClusters = nbSeriesInChunk;
-
- double idealNbSeriesInChunk = 0.0; //unused if randomize == TRUE
- if (!randomize)
- {
- // Adjust nbSeriesInChunk to avoid small remainders.
- // Each node should have at least nbSeriesInChunk (as given to the function).
-
- // ==> We seek for the largest N such that (double)totalNbSeries / N >= nbSeriesInChunk
- uint32_t N = totalNbSeries / nbSeriesInChunk + 1;
- while ((double)totalNbSeries / N < nbSeriesInChunk) N--;
- // At this point N>=1 is the solution
- idealNbSeriesInChunk = (double)totalNbSeries / N;
- nbSeriesInChunk = ceil(idealNbSeriesInChunk);
- }
-
- // Initialize random generator
- init_rng(1);
-
- // Find out my identity in the default communicator
- int myrank;
- MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
- if (myrank == 0)
- {
- // create temporary folder for intermediate results
- mkdir(".tmp", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
-
- master_run(ifileName, totalNbSeries, nbSeriesInChunk, idealNbSeriesInChunk,
- tsLength, nbClusters, randomize, p_for_dissims);
- }
-
- else
- slave_run(myrank, nbSeriesInChunk, nbClusters);
-
- MPI_Finalize();
- return 0;
-}
-
-//main classification task (using clustering result)
-//NOTE: ifileName == courbes à classer (?!)
-int classif_main(int argc, char** argv)
-{
- const char* ifileName = argv[1];
- const char* xmlFileName = argv[2];
-
- // 1] load and retrieve info from XML file
- xmlDoc* doc = xmlReadFile(xmlFileName, NULL, 0);
-
- // Get the root element node
- xmlNode* root_element = xmlDocGetRootElement(doc);
-
- uint32_t p_for_dissims = 0;
- uint32_t* ranks = NULL;
- uint32_t nbClusters = 0;
- char* binFileName;
- for (xmlNode* cur_node=root_element->children; cur_node; cur_node=cur_node->next)
- {
- if (cur_node->type != XML_ELEMENT_NODE)
- continue;
- if (!strcmp(cur_node->name,"p_for_dissims"))
- p_for_dissims = atoi(cur_node->last->content);
- else if (!strcmp(cur_node->name,"ranks"))
- {
- //first pass: find nbClusters
- for (xmlNode* rankNode=cur_node->children; rankNode; rankNode=rankNode->next)
- {
- if (rankNode->type == XML_ELEMENT_NODE && !strcmp(rankNode->name,"rank"))
- nbClusters++;
- }
- //second pass: fill ranks (not optimal, but not very important here)
- ranks = (uint32_t*) malloc(nbClusters*sizeof(uint32_t));
- uint32_t index = 0;
- for (xmlNode* rankNode=cur_node->children; rankNode; rankNode=rankNode->next)
- {
- if (rankNode->type == XML_ELEMENT_NODE && !strcmp(rankNode->name,"rank"))
- ranks[index++] = atoi(rankNode->last->content) - 1;
- }
- }
- else if (!strcmp(cur_node->name,"file"))
- {
- binFileName = (char*) malloc(strlen(cur_node->last->content)+1);
- strcpy(binFileName, cur_node->last->content);
- }
- }
-
- xmlFreeDoc(doc);
- xmlCleanupParser();
-
- uint32_t tsLength1 = get_tsLength(ifileName);
- uint32_t tsLength2 = get_tsLength(binFileName);
- if (tsLength1 != tsLength2)
- {
- fprintf(stderr,"Warning: nbValues do not match. Data will be truncated.\n");
- if (tsLength1 > tsLength2)
- tsLength1 = tsLength2;
- }
- uint32_t nbValues = (tsLength1 - 4) / 4;
-
- // 2] Classify all series by batches of CURVES_PER_REQUEST
- uint32_t nbSeries = get_nbSeries(ifileName);
- PowerCurve* medoids = deserialize(binFileName, NULL, ranks, nbClusters);
- free(binFileName);
- free(ranks);
- ranks = (uint32_t*)malloc(CURVES_PER_REQUEST*sizeof(uint32_t));
-
- uint32_t smallestNonProcessedIndex = 0;
- double DISTOR = 0.0;
- FILE* labelsFile = fopen("LABELS", "w");
- while (smallestNonProcessedIndex < nbSeries)
- {
- uint32_t lowerBound = smallestNonProcessedIndex;
- uint32_t upperBound = smallestNonProcessedIndex + CURVES_PER_REQUEST;
- if (upperBound > nbSeries)
- upperBound = nbSeries;
- for (uint32_t i=0; i<upperBound-lowerBound; i++)
- ranks[i] = lowerBound + i;
- PowerCurve* data = deserialize(ifileName, NULL, ranks, upperBound-lowerBound);
- uint32_t* labels = get_class(data, upperBound-lowerBound, medoids, nbClusters,
- nbValues, p_for_dissims, &DISTOR);
- // send labels to LABELS file
- for (uint32_t i=0; i<upperBound-lowerBound; i++)
- {
- free(data[i].values);
- fprintf(labelsFile, "%u\n",labels[i]);
- }
- free(data);
- free(labels);
- smallestNonProcessedIndex += (upperBound-lowerBound);
- }
- fclose(labelsFile);
- for (uint32_t i=0; i<nbClusters; i++)
- free(medoids[i].values);
- free(medoids);
- free(ranks);
- fprintf(stdout, "DISTOR = %g\n",DISTOR);
- return 0;
-}
-
-int main(int argc, char** argv)
-{
- if (argc <= 1)
- {
- fprintf(stderr, "No argument provided. Exit.\n");
- return 1;
- }
-
- if (!strcmp(argv[1], "serialize"))
- return serialize_main(argc-1, argv+1);
- if (!strcmp(argv[1], "deserialize"))
- return deserialize_main(argc-1, argv+1);
- if (!strcmp(argv[1], "cluster"))
- return cluster_main(argc-1, argv+1);
- if (!strcmp(argv[1], "classif"))
- return classif_main(argc-1, argv+1);
-
- fprintf(stderr, "Unknown first argument.\n");
- return 1;
-}