Skip to content

Commit e29db28

Browse files
authored
Merge pull request #32 from Arkoniak/hamerly_algorithm
Initial hamerly implementation
2 parents 626dec3 + e9d8832 commit e29db28

File tree

6 files changed

+478
-9
lines changed

6 files changed

+478
-9
lines changed

benchmark/bench01_distance.jl

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,6 @@ centroids = rand(10, 2)
1717
d = Vector{Float64}(undef, 100_000)
1818
suite["100kx10"] = @benchmarkable ParallelKMeans.colwise!($d, $X, $centroids)
1919

20-
# for reference
21-
metric = SqEuclidean()
22-
#suite["100kx10_distances"] = @benchmarkable Distances.colwise!($d, $metric, $X, $centroids)
23-
dist = Distances.pairwise(metric, X, centroids, dims = 2)
24-
min = minimum(dist, dims=2)
25-
suite["100kx10_distances"] = @benchmarkable $d = min
2620
end # module
2721

2822
BenchDistance.suite

benchmark/bench02_kmeans.jl

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
module BenchKMeans
2+
using Random
3+
using ParallelKMeans
4+
using BenchmarkTools
5+
6+
suite = BenchmarkGroup()
7+
8+
Random.seed!(2020)
9+
X = rand(10, 100_000)
10+
11+
centroids3 = ParallelKMeans.smart_init(X, 3, 1, init="kmeans++").centroids
12+
centroids10 = ParallelKMeans.smart_init(X, 10, 1, init="kmeans++").centroids
13+
14+
suite["10x100_000x3x1 Lloyd"] = @benchmarkable kmeans($X, 3, init = $centroids3, n_threads = 1, verbose = false, tol = 1e-6, max_iters = 1000)
15+
suite["10x100_000x3x1 Hammerly"] = @benchmarkable kmeans(Hamerly(), $X, 3, init = $centroids3, n_threads = 1, verbose = false, tol = 1e-6, max_iters = 1000)
16+
17+
suite["10x100_000x3x2 Lloyd"] = @benchmarkable kmeans($X, 3, init = $centroids3, n_threads = 2, verbose = false, tol = 1e-6, max_iters = 1000)
18+
suite["10x100_000x3x2 Hammerly"] = @benchmarkable kmeans(Hamerly(), $X, 3, init = $centroids3, n_threads = 2, verbose = false, tol = 1e-6, max_iters = 1000)
19+
20+
suite["10x100_000x10x1 Lloyd"] = @benchmarkable kmeans($X, 10, init = $centroids10, n_threads = 1, verbose = false, tol = 1e-6, max_iters = 1000)
21+
suite["10x100_000x10x1 Hammerly"] = @benchmarkable kmeans(Hamerly(), $X, 10, init = $centroids10, n_threads = 1, verbose = false, tol = 1e-6, max_iters = 1000)
22+
23+
suite["10x100_000x10x2 Lloyd"] = @benchmarkable kmeans($X, 10, init = $centroids10, n_threads = 2, verbose = false, tol = 1e-6, max_iters = 1000)
24+
suite["10x100_000x10x2 Hammerly"] = @benchmarkable kmeans(Hamerly(), $X, 10, init = $centroids10, n_threads = 2, verbose = false, tol = 1e-6, max_iters = 1000)
25+
26+
end # module
27+
28+
BenchKMeans.suite

src/ParallelKMeans.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ include("seeding.jl")
77
include("kmeans.jl")
88
include("lloyd.jl")
99
include("light_elkan.jl")
10+
include("hamerly.jl")
1011

1112
export kmeans
12-
export Lloyd, LightElkan
13+
export Lloyd, LightElkan, Hamerly
1314

1415
end # module

src/hamerly.jl

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
struct Hamerly <: AbstractKMeansAlg end
2+
3+
function kmeans(alg::Hamerly, design_matrix, k;
4+
n_threads = Threads.nthreads(),
5+
k_init = "k-means++", max_iters = 300,
6+
tol = 1e-6, verbose = true, init = nothing)
7+
nrow, ncol = size(design_matrix)
8+
containers = create_containers(alg, k, nrow, ncol, n_threads)
9+
10+
return kmeans!(alg, containers, design_matrix, k, n_threads = n_threads,
11+
k_init = k_init, max_iters = max_iters, tol = tol,
12+
verbose = verbose, init = init)
13+
end
14+
15+
function kmeans!(alg::Hamerly, containers, design_matrix, k;
16+
n_threads = Threads.nthreads(),
17+
k_init = "k-means++", max_iters = 300,
18+
tol = 1e-6, verbose = true, init = nothing)
19+
nrow, ncol = size(design_matrix)
20+
centroids = init == nothing ? smart_init(design_matrix, k, n_threads, init=k_init).centroids : deepcopy(init)
21+
22+
@parallelize n_threads ncol chunk_initialize!(alg, containers, centroids, design_matrix)
23+
24+
converged = false
25+
niters = 1
26+
J_previous = 0.0
27+
p = containers.p
28+
29+
# Update centroids & labels with closest members until convergence
30+
while niters <= max_iters
31+
update_containers!(containers, alg, centroids, n_threads)
32+
@parallelize n_threads ncol chunk_update_centroids!(centroids, containers, alg, design_matrix)
33+
collect_containers(alg, containers, n_threads)
34+
35+
J = sum(containers.ub)
36+
move_centers!(centroids, containers, alg)
37+
38+
r1, r2, pr1, pr2 = double_argmax(p)
39+
@parallelize n_threads ncol chunk_update_bounds!(containers, r1, r2, pr1, pr2)
40+
41+
if verbose
42+
# Show progress and terminate if J stopped decreasing.
43+
println("Iteration $niters: Jclust = $J")
44+
end
45+
46+
# Check for convergence
47+
if (niters > 1) & (abs(J - J_previous) < (tol * J))
48+
converged = true
49+
break
50+
end
51+
52+
J_previous = J
53+
niters += 1
54+
end
55+
56+
@parallelize n_threads ncol sum_of_squares(containers, design_matrix, containers.labels, centroids)
57+
totalcost = sum(containers.sum_of_squares)
58+
59+
# Terminate algorithm with the assumption that K-means has converged
60+
if verbose & converged
61+
println("Successfully terminated with convergence.")
62+
end
63+
64+
# TODO empty placeholder vectors should be calculated
65+
# TODO Float64 type definitions is too restrictive, should be relaxed
66+
# especially during GPU related development
67+
return KmeansResult(centroids, containers.labels, Float64[], Int[], Float64[], totalcost, niters, converged)
68+
end
69+
70+
function collect_containers(alg::Hamerly, containers, n_threads)
71+
if n_threads == 1
72+
@inbounds containers.centroids_new[end] .= containers.centroids_new[1] ./ containers.centroids_cnt[1]'
73+
else
74+
@inbounds containers.centroids_new[end] .= containers.centroids_new[1]
75+
@inbounds containers.centroids_cnt[end] .= containers.centroids_cnt[1]
76+
@inbounds for i in 2:n_threads
77+
containers.centroids_new[end] .+= containers.centroids_new[i]
78+
containers.centroids_cnt[end] .+= containers.centroids_cnt[i]
79+
end
80+
81+
@inbounds containers.centroids_new[end] .= containers.centroids_new[end] ./ containers.centroids_cnt[end]'
82+
end
83+
end
84+
85+
function create_containers(alg::Hamerly, k, nrow, ncol, n_threads)
86+
lng = n_threads + 1
87+
centroids_new = Vector{Array{Float64,2}}(undef, lng)
88+
centroids_cnt = Vector{Vector{Int}}(undef, lng)
89+
90+
for i = 1:lng
91+
centroids_new[i] = zeros(nrow, k)
92+
centroids_cnt[i] = zeros(k)
93+
end
94+
95+
# Upper bound to the closest center
96+
ub = Vector{Float64}(undef, ncol)
97+
98+
# lower bound to the second closest center
99+
lb = Vector{Float64}(undef, ncol)
100+
101+
labels = zeros(Int, ncol)
102+
103+
# distance that centroid moved
104+
p = Vector{Float64}(undef, k)
105+
106+
# distance from the center to the closest other center
107+
s = Vector{Float64}(undef, k)
108+
109+
# total_sum_calculation
110+
sum_of_squares = Vector{Float64}(undef, n_threads)
111+
112+
return (
113+
centroids_new = centroids_new,
114+
centroids_cnt = centroids_cnt,
115+
labels = labels,
116+
ub = ub,
117+
lb = lb,
118+
p = p,
119+
s = s,
120+
sum_of_squares = sum_of_squares
121+
)
122+
end
123+
124+
"""
125+
chunk_initialize!(alg::Hamerly, containers, centroids, design_matrix, r, idx)
126+
127+
Initial calulation of all bounds and points labeling.
128+
"""
129+
function chunk_initialize!(alg::Hamerly, containers, centroids, design_matrix, r, idx)
130+
centroids_cnt = containers.centroids_cnt[idx]
131+
centroids_new = containers.centroids_new[idx]
132+
133+
@inbounds for i in r
134+
label = point_all_centers!(containers, centroids, design_matrix, i)
135+
centroids_cnt[label] += 1
136+
for j in axes(design_matrix, 1)
137+
centroids_new[j, label] += design_matrix[j, i]
138+
end
139+
end
140+
end
141+
142+
"""
143+
update_containers!(containers, ::Hamerly, centroids, n_threads)
144+
145+
Calculates minimum distances from centers to each other.
146+
"""
147+
function update_containers!(containers, ::Hamerly, centroids, n_threads)
148+
s = containers.s
149+
s .= Inf
150+
@inbounds for i in axes(centroids, 2)
151+
for j in i+1:size(centroids, 2)
152+
d = distance(centroids, centroids, i, j)
153+
d = 0.25*d
154+
s[i] = s[i] > d ? d : s[i]
155+
s[j] = s[j] > d ? d : s[j]
156+
end
157+
end
158+
end
159+
160+
"""
161+
chunk_update_centroids!(centroids, containers, alg::Hamerly, design_matrix, r, idx)
162+
163+
Detailed description of this function can be found in the original paper. It iterates through
164+
all points and tries to skip some calculation using known upper and lower bounds of distances
165+
from point to centers. If it fails to skip than it fall back to generic `point_all_centers!` function.
166+
"""
167+
function chunk_update_centroids!(centroids, containers, alg::Hamerly, design_matrix, r, idx)
168+
169+
# unpack containers for easier manipulations
170+
centroids_new = containers.centroids_new[idx]
171+
centroids_cnt = containers.centroids_cnt[idx]
172+
labels = containers.labels
173+
s = containers.s
174+
lb = containers.lb
175+
ub = containers.ub
176+
177+
@inbounds for i in r
178+
# m ← max(s(a(i))/2, l(i))
179+
m = max(s[labels[i]], lb[i])
180+
# first bound test
181+
if ub[i] > m
182+
# tighten upper bound
183+
label = labels[i]
184+
ub[i] = distance(design_matrix, centroids, i, label)
185+
# second bound test
186+
if ub[i] > m
187+
label_new = point_all_centers!(containers, centroids, design_matrix, i)
188+
if label != label_new
189+
labels[i] = label_new
190+
centroids_cnt[label_new] += 1
191+
centroids_cnt[label] -= 1
192+
for j in axes(design_matrix, 1)
193+
centroids_new[j, label_new] += design_matrix[j, i]
194+
centroids_new[j, label] -= design_matrix[j, i]
195+
end
196+
end
197+
end
198+
end
199+
end
200+
end
201+
202+
"""
203+
point_all_centers!(containers, centroids, design_matrix, i)
204+
205+
Calculates new labels and upper and lower bounds for all points.
206+
"""
207+
function point_all_centers!(containers, centroids, design_matrix, i)
208+
ub = containers.ub
209+
lb = containers.lb
210+
labels = containers.labels
211+
212+
min_distance = Inf
213+
min_distance2 = Inf
214+
label = 1
215+
@inbounds for k in axes(centroids, 2)
216+
dist = distance(design_matrix, centroids, i, k)
217+
if min_distance > dist
218+
label = k
219+
min_distance2 = min_distance
220+
min_distance = dist
221+
elseif min_distance2 > dist
222+
min_distance2 = dist
223+
end
224+
end
225+
226+
ub[i] = min_distance
227+
lb[i] = min_distance2
228+
labels[i] = label
229+
230+
return label
231+
end
232+
233+
"""
234+
move_centers!(centroids, containers, ::Hamerly)
235+
236+
Calculates new positions of centers and distance they have moved. Results are stored
237+
in `centroids` and `p` respectively.
238+
"""
239+
function move_centers!(centroids, containers, ::Hamerly)
240+
centroids_new = containers.centroids_new[end]
241+
p = containers.p
242+
243+
@inbounds for i in axes(centroids, 2)
244+
d = 0.0
245+
for j in axes(centroids, 1)
246+
d += (centroids[j, i] - centroids_new[j, i])^2
247+
centroids[j, i] = centroids_new[j, i]
248+
end
249+
p[i] = d
250+
end
251+
end
252+
253+
"""
254+
chunk_update_bounds!(containers, r1, r2, pr1, pr2, r, idx)
255+
256+
Updates upper and lower bounds of point distance to the centers, with regard to the centers movement.
257+
Since bounds are squred distance, `sqrt` is used to make corresponding estimation, unlike
258+
the original paper, where usual metric is used.
259+
260+
Using notation from original paper, `u` is upper bound and `a` is `labels`, so
261+
262+
`u[i] -> u[i] + p[a[i]]`
263+
264+
then squared distance is
265+
266+
`u[i]^2 -> (u[i] + p[a[i]])^2 = u[i]^2 + 2 p[a[i]] u[i] + p[a[i]]^2`
267+
268+
Taking into account that in our noations `p^2 -> p`, `u^2 -> ub` we obtain
269+
270+
`ub[i] -> ub[i] + 2 sqrt(p[a[i]] ub[i]) + p[a[i]]`
271+
272+
The same applies to the lower bounds.
273+
"""
274+
function chunk_update_bounds!(containers, r1, r2, pr1, pr2, r, idx)
275+
p = containers.p
276+
ub = containers.ub
277+
lb = containers.lb
278+
labels = containers.labels
279+
280+
@inbounds for i in r
281+
label = labels[i]
282+
ub[i] += 2*sqrt(abs(ub[i] * p[label])) + p[label]
283+
if r1 == label
284+
lb[i] += pr2 - 2*sqrt(abs(pr2*lb[i]))
285+
else
286+
lb[i] += pr1 - 2*sqrt(abs(pr1*lb[i]))
287+
end
288+
end
289+
end
290+
291+
"""
292+
double_argmax(p)
293+
294+
Finds maximum and next after maximum arguments.
295+
"""
296+
function double_argmax(p)
297+
r1, r2 = 1, 1
298+
d1 = p[1]
299+
d2 = -1.0
300+
for i in 2:length(p)
301+
if p[i] > d1
302+
r2 = r1
303+
r1 = i
304+
d2 = d1
305+
d1 = p[i]
306+
elseif p[i] > d2
307+
d2 = p[i]
308+
r2 = i
309+
end
310+
end
311+
312+
r1, r2, d1, d2
313+
end

0 commit comments

Comments
 (0)