@@ -825,12 +825,14 @@ void clueweb() {
825
825
}
826
826
827
827
// k-tree
828
- if (false ) {
828
+ if (true ) {
829
+ // build tree
829
830
int m = 1000 , maxiters = 10 ;
830
831
cout << " -----" << endl;
831
832
cout << " Building K-tree of order m=" << m
832
833
<< " , k-means maxiters=" << maxiters << endl;
833
834
boost::timer::auto_cpu_timer all;
835
+ boost::timer::auto_cpu_timer building;
834
836
KTree<vecType, clustererType, distanceType, protoType> kt (m, maxiters);
835
837
kt.setDelayedUpdates (true );
836
838
kt.setUpdateDelay (1000 );
@@ -845,16 +847,24 @@ void clueweb() {
845
847
}
846
848
}
847
849
cout << endl;
848
- cout << " rearranging K-tree" << endl;
850
+ building.stop ();
851
+ cout << " Building K-tree took " << building.elapsed ().wall / 1e9 << " seconds" << endl;
852
+
853
+ // rearrange leaves
854
+ boost::timer::auto_cpu_timer rearranging;
855
+ cout << " Rearranging K-tree" << endl;
849
856
kt.rearrange ();
857
+ cout << " Rearranging K-tree took " << rearranging.elapsed ().wall / 1e9 << " seconds" << endl;
858
+
859
+ // print stats
850
860
all.stop ();
851
861
kt.printStats ();
852
862
double seconds = all.elapsed ().wall / 1e9 ;
853
- cout << " Building K-tree took " << seconds << " seconds" << endl;
863
+ cout << " K-tree took " << seconds << " seconds" << endl;
854
864
}
855
865
856
866
// EM-tree
857
- if (true ) {
867
+ if (false ) {
858
868
int maxiters = 4 ;
859
869
int clusters = 110000 ;
860
870
int m = (int )sqrt (clusters);
@@ -902,52 +912,67 @@ void clueweb() {
902
912
903
913
// TSVQ EM-tree hybrid
904
914
if (false ) {
905
- int clusters = 110000 ;
906
- int m = (int )sqrt (clusters);
907
- int depth = 3 ;
908
- int maxiters = 4 ;
909
- int sampleSize = 2000000 ;
915
+ // record time for all operations
916
+ boost::timer::auto_cpu_timer all;
910
917
911
918
// sample data
919
+ int sampleSize = 2000000 ;
912
920
vector < SVector<bool >*> sample = vectors;
913
921
random_shuffle (sample.begin (), sample.end ());
914
922
sample.resize (sampleSize);
915
-
916
- // record time for all operations
917
- boost::timer::auto_cpu_timer all;
918
-
923
+
919
924
// build TSVQ on sample
925
+ int clusters = 110000 ;
926
+ int m = (int )sqrt (clusters);
927
+ int depth = 3 ;
928
+ int tsvqMaxiters = 5 ;
920
929
boost::timer::auto_cpu_timer tsvqTimer;
921
- TSVQ<vecType, clustererType, distanceType, protoType> tsvq (m, depth, maxiters);
922
- tsvqTimer.start ();
930
+ TSVQ<vecType, clustererType, distanceType, protoType> tsvq (m, depth, tsvqMaxiters);
923
931
tsvq.cluster (sample);
924
932
tsvqTimer.stop ();
925
933
tsvq.printStats ();
926
934
cout << endl << " Building TSVQ on sample took " << tsvqTimer.elapsed ().wall / 1e9 << " seconds" << endl;
927
935
cout << " --------" << endl;
928
936
929
937
// 2 iterations of EM-tree on all data, using TSVQ sample as seed
938
+ int emtreeMaxiters = 2 ;
930
939
EMTree<vecType, clustererType, distanceType, protoType> emtree (tsvq.getMWayTree ());
931
940
boost::timer::auto_cpu_timer emtreeTimer;
932
941
{
933
942
boost::timer::auto_cpu_timer iter;
934
- emtree.EMStep (vectors);
943
+
944
+ // place all data into TSVQ initialized tree
945
+ emtree.replace (vectors);
946
+ cout << " placed all points into TSVQ tree" << endl;
947
+ emtree.printStats ();
948
+ cout << endl << " --------" << endl;
949
+
950
+ // prune
951
+ int pruned = 1 ;
952
+ while (pruned > 0 ) {
953
+ pruned = emtree.prune ();
954
+ }
955
+
956
+ // update means
957
+ emtree.rebuildInternal ();
958
+
959
+ // print stats
935
960
iter.stop ();
936
961
cout << " iteration 1 took " << iter.elapsed ().wall / 1e9 << " seconds" << endl;
937
- cout << " RMSE = " << emtree.getRMSE ();
938
- cout << " --------" << endl;
962
+ emtree.printStats ();
963
+ cout << endl << " --------" << endl;
939
964
}
940
- for (int i = 1 ; i < maxiters ; ++i) {
965
+ for (int i = 1 ; i < emtreeMaxiters ; ++i) {
941
966
boost::timer::auto_cpu_timer iter;
942
967
emtree.EMStep ();
943
968
iter.stop ();
944
969
cout << " iteration " << i + 1 << " took " << iter.elapsed ().wall / 1e9 << " seconds" << endl;
945
- cout << " RMSE = " << emtree.getRMSE ();
970
+ emtree.printStats ();
946
971
cout << " --------" << endl;
947
972
}
948
973
emtreeTimer.stop ();
949
974
emtree.printStats ();
950
- cout << endl << " 2 iterations of EM-tree took " << emtreeTimer.elapsed ().wall / 1e9 << " seconds" << endl;
975
+ cout << endl << emtreeMaxiters << " iterations of EM-tree took " << emtreeTimer.elapsed ().wall / 1e9 << " seconds" << endl;
951
976
952
977
// report all time
953
978
all.stop ();
0 commit comments