small bug fixes & TODO requests

PyDataBlog · PyDataBlog · commit a54432a200ad · 2020-04-03T21:17:40.000+02:00
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -52,7 +52,7 @@ git checkout experimental
 
 
 ## Pending Features
-- [ ] Implementation of Hamerly implementation. 
+- [X] Implementation of [Hamerly's algorithm](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
 - [ ] Full Implementation of Triangle inequality based on [Elkan C. (2003) "Using the Triangle Inequality to Accelerate
 K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
 - [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
@@ -81,10 +81,10 @@ results = kmeans(X, 3; n_threads=1, max_iters=300)
 The main design goal is to offer all available variations of the KMeans algorithm to end users as composable elements. By default, Lloyd's implementation is used but users can specify different variations of the KMeans clustering algorithm via this interface
 
 ```julia
-some_results = kmeans([algo], data_matrix, k; kwargs)
+some_results = kmeans([algo], input_matrix, k; kwargs)
 
 # example
-r = kmeans(Lloyd(), X, 4)  # same result as the default 
+r = kmeans(Lloyd(), X, 3)  # same result as the default 
 ```
 
 ### Supported KMeans algorithm variations.
@@ -143,6 +143,8 @@ Currently, this package is benchmarked against similar implementation in both Py
 Currently, the benchmark speed tests are based on the search for optimal number of clusters using the [Elbow Method](https://en.wikipedia.org/wiki/Elbow_method_(clustering)) since this is a practical use case for most practioners employing the K-Means algorithm. 
 
 
+<!-- Insert Benchmark Plot Right Below -->
+
 
 |      Package      | Language |             Input Data            | Execution Time |
 |:-----------------:|:--------:|:---------------------------------:|:--------------:|
@@ -161,7 +163,7 @@ Ultimately, we see this package as potentially the one stop shop for everything
 
 Detailed contribution guidelines will be added in upcoming releases.
 
-<!--- Insert Contribution Guidelines --->
+<!--- Insert Contribution Guidelines Below --->
 
 ```@index
 ```
diff --git a/extras/ClusteringJL & ParallelKMeans Benchmarks Final.ipynb b/extras/ClusteringJL & ParallelKMeans Benchmarks Final.ipynb
@@ -31,7 +31,16 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "┌ Info: Precompiling ParallelKMeans [42b8e9d4-006b-409a-8472-7f34b3fb58af]\n",
+      "└ @ Base loading.jl:1260\n"
+     ]
+    }
+   ],
    "source": [
     "# Load Packages\n",
     "using Clustering\n",
@@ -99,164 +108,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "BenchmarkTools.Trial: \n",
-       "  memory estimate:  29.06 GiB\n",
-       "  allocs estimate:  27820\n",
-       "  --------------\n",
-       "  minimum time:     486.203 s (0.57% GC)\n",
-       "  median time:      620.239 s (0.53% GC)\n",
-       "  mean time:        604.342 s (0.55% GC)\n",
-       "  maximum time:     681.707 s (0.53% GC)\n",
-       "  --------------\n",
-       "  samples:          7\n",
-       "  evals/sample:     1"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "@benchmark [Clustering.kmeans(X_1m, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] samples=7 seconds=6000"
+    "@btime [Clustering.kmeans(X_1m, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "BenchmarkTools.Trial: \n",
-       "  memory estimate:  2.39 GiB\n",
-       "  allocs estimate:  22563\n",
-       "  --------------\n",
-       "  minimum time:     38.106 s (0.58% GC)\n",
-       "  median time:      42.316 s (0.55% GC)\n",
-       "  mean time:        42.721 s (0.54% GC)\n",
-       "  maximum time:     48.713 s (0.48% GC)\n",
-       "  --------------\n",
-       "  samples:          7\n",
-       "  evals/sample:     1"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "@benchmark [Clustering.kmeans(X_100k, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] samples=7 seconds=3000"
+    "@btime [Clustering.kmeans(X_100k, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "InterruptException",
-     "evalue": "InterruptException:",
-     "output_type": "error",
-     "traceback": [
-      "InterruptException:",
-      "",
-      "Stacktrace:",
-      " [1] Array at ./boot.jl:407 [inlined]",
-      " [2] Array at ./boot.jl:415 [inlined]",
-      " [3] similar at ./array.jl:361 [inlined]",
-      " [4] similar at ./abstractarray.jl:634 [inlined]",
-      " [5] reducedim_initarray at ./reducedim.jl:92 [inlined]",
-      " [6] reducedim_initarray at ./reducedim.jl:93 [inlined]",
-      " [7] reducedim_init at ./reducedim.jl:172 [inlined]",
-      " [8] _mapreduce_dim at ./reducedim.jl:317 [inlined]",
-      " [9] #mapreduce#580 at ./reducedim.jl:307 [inlined]",
-      " [10] _sum at ./reducedim.jl:679 [inlined]",
-      " [11] #sum#584 at ./reducedim.jl:653 [inlined]",
-      " [12] _pairwise!(::Array{Float64,2}, ::Distances.SqEuclidean, ::Array{Float64,2}, ::Array{Float64,2}) at /Users/mysterio/.julia/packages/Distances/jwhuc/src/metrics.jl:563",
-      " [13] pairwise!(::Array{Float64,2}, ::Distances.SqEuclidean, ::Array{Float64,2}, ::Array{Float64,2}; dims::Int64) at /Users/mysterio/.julia/packages/Distances/jwhuc/src/generic.jl:166",
-      " [14] _kmeans!(::Array{Float64,2}, ::Nothing, ::Array{Float64,2}, ::Int64, ::Float64, ::Int64, ::Distances.SqEuclidean) at /Users/mysterio/.julia/packages/Clustering/uj53P/src/kmeans.jl:169",
-      " [15] kmeans!(::Array{Float64,2}, ::Array{Float64,2}; weights::Nothing, maxiter::Int64, tol::Float64, display::Symbol, distance::Distances.SqEuclidean) at /Users/mysterio/.julia/packages/Clustering/uj53P/src/kmeans.jl:70",
-      " [16] kmeans(::Array{Float64,2}, ::Int64; weights::Nothing, init::Symbol, maxiter::Int64, tol::Float64, display::Symbol, distance::Distances.SqEuclidean) at /Users/mysterio/.julia/packages/Clustering/uj53P/src/kmeans.jl:112",
-      " [17] (::var\"#9#11\")(::Int64) at ./none:0",
-      " [18] iterate at ./generator.jl:47 [inlined]",
-      " [19] collect_to!(::Array{Float64,1}, ::Base.Generator{UnitRange{Int64},var\"#9#11\"}, ::Int64, ::Int64) at ./array.jl:710",
-      " [20] collect_to_with_first!(::Array{Float64,1}, ::Float64, ::Base.Generator{UnitRange{Int64},var\"#9#11\"}, ::Int64) at ./array.jl:689",
-      " [21] collect(::Base.Generator{UnitRange{Int64},var\"#9#11\"}) at ./array.jl:670",
-      " [22] ##core#264() at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:371",
-      " [23] ##sample#265(::BenchmarkTools.Parameters) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:379",
-      " [24] sample at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:394 [inlined]",
-      " [25] _lineartrial(::BenchmarkTools.Benchmark{Symbol(\"##benchmark#263\")}, ::BenchmarkTools.Parameters; maxevals::Int64, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:133",
-      " [26] _lineartrial(::BenchmarkTools.Benchmark{Symbol(\"##benchmark#263\")}, ::BenchmarkTools.Parameters) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:125",
-      " [27] #invokelatest#1 at ./essentials.jl:712 [inlined]",
-      " [28] invokelatest at ./essentials.jl:711 [inlined]",
-      " [29] #lineartrial#38 at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:33 [inlined]",
-      " [30] lineartrial at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:33 [inlined]",
-      " [31] tune!(::BenchmarkTools.Benchmark{Symbol(\"##benchmark#263\")}, ::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:209",
-      " [32] tune! at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:208 [inlined] (repeats 2 times)",
-      " [33] top-level scope at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:288",
-      " [34] top-level scope at In[9]:1"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "@benchmark [Clustering.kmeans(X_10k, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] samples=7 seconds=1200"
+    "@btime [Clustering.kmeans(X_10k, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "InterruptException",
-     "evalue": "InterruptException:",
-     "output_type": "error",
-     "traceback": [
-      "InterruptException:",
-      "",
-      "Stacktrace:",
-      " [1] Weights at /Users/mysterio/.julia/packages/StatsBase/Q9jSr/src/weights.jl:13 [inlined] (repeats 2 times)",
-      " [2] Weights at /Users/mysterio/.julia/packages/StatsBase/Q9jSr/src/weights.jl:16 [inlined]",
-      " [3] weights at /Users/mysterio/.julia/packages/StatsBase/Q9jSr/src/weights.jl:76 [inlined]",
-      " [4] wsample(::Random._GLOBAL_RNG, ::UnitRange{Int64}, ::Array{Float64,1}) at /Users/mysterio/.julia/packages/StatsBase/Q9jSr/src/sampling.jl:829",
-      " [5] wsample at /Users/mysterio/.julia/packages/StatsBase/Q9jSr/src/sampling.jl:830 [inlined]",
-      " [6] initseeds!(::Array{Int64,1}, ::KmppAlg, ::Array{Float64,2}, ::Distances.SqEuclidean) at /Users/mysterio/.julia/packages/Clustering/uj53P/src/seeding.jl:176",
-      " [7] initseeds! at /Users/mysterio/.julia/packages/Clustering/uj53P/src/seeding.jl:161 [inlined]",
-      " [8] initseeds at /Users/mysterio/.julia/packages/Clustering/uj53P/src/seeding.jl:42 [inlined]",
-      " [9] initseeds(::Symbol, ::Array{Float64,2}, ::Int64) at /Users/mysterio/.julia/packages/Clustering/uj53P/src/seeding.jl:74",
-      " [10] kmeans(::Array{Float64,2}, ::Int64; weights::Nothing, init::Symbol, maxiter::Int64, tol::Float64, display::Symbol, distance::Distances.SqEuclidean) at /Users/mysterio/.julia/packages/Clustering/uj53P/src/kmeans.jl:109",
-      " [11] (::var\"#12#14\")(::Int64) at ./none:0",
-      " [12] iterate at ./generator.jl:47 [inlined]",
-      " [13] collect_to!(::Array{Float64,1}, ::Base.Generator{UnitRange{Int64},var\"#12#14\"}, ::Int64, ::Int64) at ./array.jl:710",
-      " [14] collect_to_with_first!(::Array{Float64,1}, ::Float64, ::Base.Generator{UnitRange{Int64},var\"#12#14\"}, ::Int64) at ./array.jl:689",
-      " [15] collect(::Base.Generator{UnitRange{Int64},var\"#12#14\"}) at ./array.jl:670",
-      " [16] ##core#268() at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:371",
-      " [17] ##sample#269(::BenchmarkTools.Parameters) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:379",
-      " [18] sample at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:394 [inlined]",
-      " [19] _lineartrial(::BenchmarkTools.Benchmark{Symbol(\"##benchmark#267\")}, ::BenchmarkTools.Parameters; maxevals::Int64, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:133",
-      " [20] _lineartrial(::BenchmarkTools.Benchmark{Symbol(\"##benchmark#267\")}, ::BenchmarkTools.Parameters) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:125",
-      " [21] #invokelatest#1 at ./essentials.jl:712 [inlined]",
-      " [22] invokelatest at ./essentials.jl:711 [inlined]",
-      " [23] #lineartrial#38 at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:33 [inlined]",
-      " [24] lineartrial at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:33 [inlined]",
-      " [25] tune!(::BenchmarkTools.Benchmark{Symbol(\"##benchmark#267\")}, ::BenchmarkTools.Parameters; progressid::Nothing, nleaves::Float64, ndone::Float64, verbose::Bool, pad::String, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:209",
-      " [26] tune! at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:208 [inlined] (repeats 2 times)",
-      " [27] top-level scope at /Users/mysterio/.julia/packages/BenchmarkTools/eCEpo/src/execution.jl:288",
-      " [28] top-level scope at In[10]:1"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "@benchmark [Clustering.kmeans(X_1k, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] samples=7 seconds=1200"
+    "@btime [Clustering.kmeans(X_1k, i; tol=1e-6, maxiter=1000).totalcost for i = 2:10] "
    ]
   },
   {
@@ -286,9 +169,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Lloyd(), X_1m, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=2000"
+    "@btime [ParallelKMeans.kmeans(Lloyd(), X_1m, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
@@ -297,9 +178,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Lloyd(), X_100k, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@btime [ParallelKMeans.kmeans(Lloyd(), X_100k, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
@@ -308,9 +187,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Lloyd(), X_10k, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@btime [ParallelKMeans.kmeans(Lloyd(), X_10k, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
@@ -319,9 +196,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Lloyd(), X_1k, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@btime [ParallelKMeans.kmeans(Lloyd(), X_1k, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
@@ -332,39 +207,39 @@
    ]
   },
   {
-   "cell_type": "raw",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Hamerly(), X_1m, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@btime [ParallelKMeans.kmeans(Hamerly(), X_1m, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
-   "cell_type": "raw",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Hamerly(), X_100k, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@btime [ParallelKMeans.kmeans(Hamerly(), X_100k, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
-   "cell_type": "raw",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Hamerly(), X_10k, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@btime [ParallelKMeans.kmeans(Hamerly(), X_10k, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10]"
    ]
   },
   {
-   "cell_type": "raw",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "@benchmark [ParallelKMeans.kmeans(Hamerly(), X_1k, i;\n",
-    "            tol=1e-6, max_iters=1000, verbose=false).totalcost \n",
-    "            for i = 2:10] samples=7 seconds=1200"
+    "@benchmark [ParallelKMeans.kmeans(Hamerly(), X_1k, i; tol=1e-6, max_iters=1000, verbose=false).totalcost for i = 2:10] samples=7 seconds=1200"
    ]
   }
  ],
@@ -379,6 +254,9 @@
    "mimetype": "application/julia",
    "name": "julia",
    "version": "1.4.0"
+  },
+  "nteract": {
+   "version": "0.22.4"
   }
  },
  "nbformat": 4,
diff --git a/src/hamerly.jl b/src/hamerly.jl