1 #include "MPI_Main/master.h"
2 #include "MPI_Main/slave.h"
3 #include "Util/utils.h"
9 #include "TimeSeries/serialize.h"
10 #include "TimeSeries/deserialize.h"
11 #include "Classification/getClass.h"
13 #include <cgds/Vector.h>
14 #include <libxml/xmlreader.h>
// Serialize text file argv[1] into binary file argv[2].
//
// Expected arguments (argv[0] is the sub-command name itself):
//   argv[1]  input text file name
//   argv[2]  output binary file name
//   argv[3]  non-zero to serialize by columns, zero to serialize by rows
//   argv[4]  number of items to serialize (0 means "all series")
//
// Returns 0 once the serializer has been called, 1 if arguments are missing.
//
// NOTE(review): the by-columns / by-rows branch and the return statements were
// not visible in the reviewed text; they are reconstructed from the `byCols`
// flag and the names of the two callees -- confirm against the original file.
int serialize_main(int argc, char** argv)
{
	// argv[1]..argv[4] are all read below; atoi() on a missing (NULL) argument
	// is undefined behaviour, so reject short command lines up front.
	if (argc < 5)
	{
		fprintf(stderr, "Usage: serialize ifileName ofileName byCols nbItems\n");
		return 1;
	}

	const char* ifileName = argv[1];
	const char* ofileName = argv[2];
	int byCols = atoi(argv[3]);
	uint32_t nbItems = atoi(argv[4]); //==0 for "all series"

	if (byCols)
		serialize_byCols(ifileName, ofileName, nbItems);
	else
		serialize_byRows(ifileName, ofileName, nbItems);

	return 0;
}
31 // deserialize binary file argv[1] into text file argv[2]
// argv[3] is a comma-separated list of tokens selecting the series to extract.
// It is split in place with strtok(), so the argument string itself is modified.
// NOTE(review): this text looks like a lossy extraction of the original file:
// original line numbers are fused into the lines, and the braces, the enclosing
// token loop, the branch conditions and the declarations of `minusPos` and
// `retrieveAll` are not visible. The comments below describe only what is shown.
32 int deserialize_main(int argc
, char** argv
)
34 const char* ifileName
= argv
[1];
35 const char* ofileName
= argv
[2];
// vector of uint32_t receiving every requested rank (1-based at this stage)
36 Vector
* vranks
= vector_new(uint32_t);
37 //each token is at most two ints (a-b = from a to b included)
38 char* token
= strtok(argv
[3], ",");
// plain array handed to deserialize(); stays NULL unless filled further down
40 uint32_t* ranks
= NULL
;
43 //scan token to find middle position of '-' (if any)
45 int tokenLength
= strlen(token
);
46 //loop starts at index 1 because -N is allowed (and means 'everything')
47 for (int i
=1; i
<tokenLength
; i
++)
// token read as a single integer
// NOTE(review): presumably the branch taken when no '-' separator was found -- confirm
57 int64_t rank
= (int64_t)atoi(token
);
63 vector_push(vranks
, (uint32_t)rank
);
// token read as a range "int1-int2": atoi() stops at the '-', and the second
// bound is parsed starting just after position minusPos
68 int int1
= atoi(token
);
69 int int2
= atoi(token
+minusPos
+1);
// push every rank of the range, both bounds included
// NOTE(review): `i` is uint32_t while int1/int2 are int, so the comparison is
// done on unsigned values
70 for (uint32_t i
=int1
; i
<=int2
; i
++)
71 vector_push(vranks
, i
);
// move on to the next comma-separated token
73 token
= strtok(NULL
, ",");
// number of ranks passed to deserialize(): chosen by a conditional on
// `retrieveAll` (its first alternative is not visible here), otherwise the
// number of collected ranks
75 uint32_t nbRanks
= retrieveAll
77 : vector_size(vranks
);
// copy the collected ranks into a malloc'ed array
// NOTE(review): the malloc() result is used without a visible NULL check
80 ranks
= (uint32_t*) malloc(nbRanks
*sizeof(uint32_t));
81 for (uint32_t i
=0; i
<nbRanks
; i
++)
83 vector_get(vranks
, i
, ranks
[i
]);
84 ranks
[i
]--; //re-express on 0...{n-1}
87 vector_destroy(vranks
);
// NOTE(review): no free(ranks) is visible after this call -- confirm who owns the array
89 deserialize(ifileName
, ofileName
, ranks
, nbRanks
);
93 //main clustering task (master or slave)
// Reads its parameters from argv[1]..argv[5], adjusts the chunk size and the
// number of clusters, then calls master_run() / slave_run().
// NOTE(review): this text looks like a lossy extraction of the original file:
// original line numbers are fused into the lines, and the braces, the body of the
// first sanity check, the random-generator initialisation call, the declaration
// of `myrank`, the test selecting master or slave, and the end of the function
// are not visible. The comments below describe only what is shown.
94 int cluster_main(int argc
, char **argv
)
96 MPI_Init(&argc
, &argv
);
// NOTE(review): the five arguments are read without a visible argc check
98 char* ifileName
= argv
[1]; //could be "../data/test.bin"
99 uint32_t nbSeriesInChunk
= atoi(argv
[2]); //could be 3000
100 uint32_t nbClusters
= atoi(argv
[3]); //could be 15
101 int randomize
= atoi(argv
[4]); //boolean
102 uint32_t p_for_dissims
= atoi(argv
[5]); //1 for L1, 2 for L2, ...etc
104 // Get totalNbSeries and tsLength
105 uint32_t totalNbSeries
= get_nbSeries(ifileName
);
106 uint32_t tsLength
= get_tsLength(ifileName
);
108 // Basic sanity checks
// nbClusters is unsigned, so `nbClusters <= 0` is true only when it equals 0;
// the statement executed when this check fires is not visible here
109 if (nbClusters
<= 0 || nbSeriesInChunk
<= 1)
// a chunk cannot hold more series than the file contains
114 if (nbSeriesInChunk
> totalNbSeries
)
115 nbSeriesInChunk
= totalNbSeries
;
// no more clusters than series in a chunk
116 if (nbClusters
> nbSeriesInChunk
)
117 nbClusters
= nbSeriesInChunk
;
119 double idealNbSeriesInChunk
= 0.0; //unused if randomize == TRUE
122 // Adjust nbSeriesInChunk to avoid small remainders.
123 // Each node should have at least nbSeriesInChunk (as given to the function).
125 // ==> We seek the largest N such that (double)totalNbSeries / N >= nbSeriesInChunk
// start one above the integer quotient, then decrease N until the average
// chunk size totalNbSeries / N is no longer below the requested size
126 uint32_t N
= totalNbSeries
/ nbSeriesInChunk
+ 1;
127 while ((double)totalNbSeries
/ N
< nbSeriesInChunk
) N
--;
128 // At this point N>=1 is the solution
129 idealNbSeriesInChunk
= (double)totalNbSeries
/ N
;
// round the (possibly fractional) average chunk size up to a whole number
130 nbSeriesInChunk
= ceil(idealNbSeriesInChunk
);
133 // Initialize random generator
136 // Find out my identity in the default communicator
138 MPI_Comm_rank(MPI_COMM_WORLD
, &myrank
);
142 // create temporary folder for intermediate results
// permissions: rwx for the owner, r-x for group and others; the return value
// of mkdir() is not checked
143 mkdir(".tmp", S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
// NOTE(review): presumably only one rank runs master_run() and the others run
// slave_run(); the selecting condition is not visible -- confirm
145 master_run(ifileName
, totalNbSeries
, nbSeriesInChunk
, idealNbSeriesInChunk
,
146 tsLength
, nbClusters
, randomize
, p_for_dissims
);
150 slave_run(myrank
, nbSeriesInChunk
, nbClusters
);
156 //main classification task (using clustering result)
157 //NOTE: ifileName == curves to classify (?!)
// Reads the clustering description from the XML file argv[2] (dissimilarity
// parameter, medoid ranks, name of the binary file holding the medoids), then
// labels the series of argv[1] batch by batch, writes one label per line to the
// file "LABELS" and prints DISTOR on stdout.
// NOTE(review): this text looks like a lossy extraction of the original file:
// original line numbers are fused into the lines, and the braces, the
// declarations of `binFileName`, `index` and `DISTOR`, the counting statement of
// the first pass, and any cleanup / return statements are not visible. The
// comments below describe only what is shown.
158 int classif_main(int argc
, char** argv
)
160 const char* ifileName
= argv
[1];
161 const char* xmlFileName
= argv
[2];
163 // 1] load and retrieve info from XML file
// NOTE(review): no NULL check on the parsed document is visible before it is used
164 xmlDoc
* doc
= xmlReadFile(xmlFileName
, NULL
, 0);
166 // Get the root element node
167 xmlNode
* root_element
= xmlDocGetRootElement(doc
);
// values filled from the XML file by the loop below
169 uint32_t p_for_dissims
= 0;
170 uint32_t* ranks
= NULL
;
171 uint32_t nbClusters
= 0;
// walk over the direct children of the root element
173 for (xmlNode
* cur_node
=root_element
->children
; cur_node
; cur_node
=cur_node
->next
)
// test for nodes that are not elements (the statement it guards is not visible)
175 if (cur_node
->type
!= XML_ELEMENT_NODE
)
// <p_for_dissims>: integer read from the content of the node's last child
177 if (!strcmp(cur_node
->name
,"p_for_dissims"))
178 p_for_dissims
= atoi(cur_node
->last
->content
);
// <ranks>: list of <rank> children, scanned twice
179 else if (!strcmp(cur_node
->name
,"ranks"))
181 //first pass: find nbClusters
182 for (xmlNode
* rankNode
=cur_node
->children
; rankNode
; rankNode
=rankNode
->next
)
184 if (rankNode
->type
== XML_ELEMENT_NODE
&& !strcmp(rankNode
->name
,"rank"))
187 //second pass: fill ranks (not optimal, but not very important here)
// NOTE(review): the malloc() result is used without a visible NULL check
188 ranks
= (uint32_t*) malloc(nbClusters
*sizeof(uint32_t));
190 for (xmlNode
* rankNode
=cur_node
->children
; rankNode
; rankNode
=rankNode
->next
)
192 if (rankNode
->type
== XML_ELEMENT_NODE
&& !strcmp(rankNode
->name
,"rank"))
// the value read from the file is decremented by one before being stored
193 ranks
[index
++] = atoi(rankNode
->last
->content
) - 1;
// <file>: copy the node content into a freshly allocated string
196 else if (!strcmp(cur_node
->name
,"file"))
198 binFileName
= (char*) malloc(strlen(cur_node
->last
->content
)+1);
199 strcpy(binFileName
, cur_node
->last
->content
);
// compare the tsLength reported for the file to classify and for the medoids
// file; on mismatch warn and keep the smaller of the two
206 uint32_t tsLength1
= get_tsLength(ifileName
);
207 uint32_t tsLength2
= get_tsLength(binFileName
);
208 if (tsLength1
!= tsLength2
)
210 fprintf(stderr
,"Warning: nbValues do not match. Data will be truncated.\n");
211 if (tsLength1
> tsLength2
)
212 tsLength1
= tsLength2
;
// NOTE(review): presumably tsLength counts bytes, with 4 bytes that are not
// values followed by 4-byte values -- confirm against the serialization format
214 uint32_t nbValues
= (tsLength1
- 4) / 4;
216 // 2] Classify all series by batches of CURVES_PER_REQUEST
217 uint32_t nbSeries
= get_nbSeries(ifileName
);
// load the medoids: deserialize() is called with a NULL output file name and
// its return value is kept as an array of PowerCurve
218 PowerCurve
* medoids
= deserialize(binFileName
, NULL
, ranks
, nbClusters
);
// `ranks` now points to a buffer of CURVES_PER_REQUEST indices reused by every batch
// NOTE(review): no free() of the previous `ranks` array is visible here
221 ranks
= (uint32_t*)malloc(CURVES_PER_REQUEST
*sizeof(uint32_t));
223 uint32_t smallestNonProcessedIndex
= 0;
// NOTE(review): the fopen() result is used without a visible NULL check
225 FILE* labelsFile
= fopen("LABELS", "w");
226 while (smallestNonProcessedIndex
< nbSeries
)
// current batch covers the indices [lowerBound, upperBound)
228 uint32_t lowerBound
= smallestNonProcessedIndex
;
229 uint32_t upperBound
= smallestNonProcessedIndex
+ CURVES_PER_REQUEST
;
// the last batch may be shorter
230 if (upperBound
> nbSeries
)
231 upperBound
= nbSeries
;
// request the consecutive indices lowerBound, lowerBound+1, ...
232 for (uint32_t i
=0; i
<upperBound
-lowerBound
; i
++)
233 ranks
[i
] = lowerBound
+ i
;
234 PowerCurve
* data
= deserialize(ifileName
, NULL
, ranks
, upperBound
-lowerBound
);
// get_class() receives the address of DISTOR in addition to the batch and the medoids
235 uint32_t* labels
= get_class(data
, upperBound
-lowerBound
, medoids
, nbClusters
,
236 nbValues
, p_for_dissims
, &DISTOR
);
237 // send labels to LABELS file
238 for (uint32_t i
=0; i
<upperBound
-lowerBound
; i
++)
// the values of curve i are released, then its label is written on its own line
240 free(data
[i
].values
);
241 fprintf(labelsFile
, "%u\n",labels
[i
]);
// advance by the number of series handled in this batch
245 smallestNonProcessedIndex
+= (upperBound
-lowerBound
);
// release the values of every medoid
248 for (uint32_t i
=0; i
<nbClusters
; i
++)
249 free(medoids
[i
].values
);
252 fprintf(stdout
, "DISTOR = %g\n",DISTOR
);
// Program entry point: dispatch on the first command-line argument.
//
// argv[1] selects the task ("serialize", "deserialize", "cluster" or
// "classif"). The matching *_main function is called with the argument list
// shifted by one, so that its own argv[0] is the task name and its argv[1]
// is the first task parameter.
//
// Returns the value returned by the selected task, or 1 when no task name is
// given or when it is not recognised.
//
// NOTE(review): the braces, the condition guarding the "No argument provided"
// message and the two failure return codes were not visible in the reviewed
// text and are reconstructed -- confirm against the original file.
int main(int argc, char** argv)
{
	if (argc <= 1)
	{
		fprintf(stderr, "No argument provided. Exit.\n");
		return 1;
	}

	if (!strcmp(argv[1], "serialize"))
		return serialize_main(argc-1, argv+1);
	if (!strcmp(argv[1], "deserialize"))
		return deserialize_main(argc-1, argv+1);
	if (!strcmp(argv[1], "cluster"))
		return cluster_main(argc-1, argv+1);
	if (!strcmp(argv[1], "classif"))
		return classif_main(argc-1, argv+1);

	fprintf(stderr, "Unknown first argument.\n");
	return 1;
}