forked from bostxavier/Serial-Speakers
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_hash_collisions.py
More file actions
65 lines (59 loc) · 1.92 KB
/
plot_hash_collisions.py
File metadata and controls
65 lines (59 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import argparse
import pathlib as pl
from collections import defaultdict
from statistics import mean
from tqdm import tqdm
import matplotlib.pyplot as plt
import scienceplots
from novelties_bookshare.encrypt import encrypt_tokens
from novelties_bookshare.experiments.data import load_book, EDITION_SETS
def mean_collisions(encrypted, tokens):
    """Return the mean number of colliding tokens per distinct hash.

    Tokens are grouped by hash value; each hash contributes the number of
    *distinct* tokens mapped to it, minus one (so 0 means "no collision").

    :param encrypted: hash values, aligned one-to-one with ``tokens``
    :param tokens: the hashable tokens that produced ``encrypted``
    :return: mean of ``len(distinct tokens) - 1`` over all distinct hashes
    :raises statistics.StatisticsError: if there is no token at all
    """
    hash2tokens = defaultdict(set)
    for hashed, token in zip(encrypted, tokens):
        hash2tokens[hashed].add(token)
    return mean(len(group) - 1 for group in hash2tokens.values())


def y_axis_label(log_scale):
    """Return the y-axis label, suffixed with " (log)" when ``log_scale``.

    The conditional is parenthesized on purpose: without the parentheses
    the whole concatenation becomes the "true" branch and the label is
    empty when ``log_scale`` is false.
    """
    return "Mean coll. per token" + (" (log)" if log_scale else "")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output-file", type=pl.Path, default=None)
    parser.add_argument("-l", "--log-scale", action="store_true")
    args = parser.parse_args()

    # Pool the tokens of every path of every edition set into one list.
    tokens = []
    for edition_sets in EDITION_SETS.values():
        for path in edition_sets.values():
            tokens.extend(load_book(path))

    # One measurement per hash length from 1 to 64 (inclusive).
    hash_lengths = list(range(1, 65))
    collisions = [
        mean_collisions(encrypt_tokens(tokens, hash_len=hash_len), tokens)
        for hash_len in tqdm(hash_lengths, ascii=True)
    ]
    print(collisions)

    # Sanity check: the measured collision rate should never increase as
    # the hash gets longer.
    assert all(
        collisions[i] >= collisions[i + 1] for i in range(len(collisions) - 1)
    )

    # Drop the points that are too close to zero to be worth plotting.
    # Filtering (x, y) pairs together keeps each value attached to its own
    # hash length instead of renumbering the x-axis afterwards.
    points = [(x, y) for x, y in zip(hash_lengths, collisions) if y > 0.01]
    X = [x for x, _ in points]
    Y = [y for _, y in points]

    plt.style.use("science")
    plt.rcParams.update({"font.size": 10})
    # Create the figure at its final size so that tight_layout() below
    # computes margins for the size that is actually rendered.
    fig, ax = plt.subplots(figsize=(4, 2))
    ax.plot(X, Y, linewidth=1, marker="*", markersize=8)
    if args.log_scale:
        ax.set_yscale("log")

    # Print each value just above its marker.
    for x, y in zip(X, Y):
        ax.annotate(
            f"{y:.2f}",
            (x, y),
            xytext=(0, 5),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=8,
        )
    ax.grid()
    ax.set_ylabel(y_axis_label(args.log_scale), fontsize=10)
    ax.set_xlabel("Hash length")
    fig.tight_layout()

    if args.output_file is not None:
        fig.savefig(args.output_file)
    else:
        plt.show()