# repack_emd.py by Takanori Nakane
#
# Velox's EMD file is an HDF5 container, but the data inside are uncompressed.
# Typical MicroED data can be compressed to less than 50% of the original size.
# Unfortunately, compression with the `h5repack` command is very slow due to an
# inefficient access pattern, as discussed in
# https://forum.hdfgroup.org/t/speeding-up-h5repack/1145.
# This script reads the data in a more efficient way and compresses about 40 times faster!
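#
# How it works: each output dataset is chunked one frame per chunk. Every frame
# is byte-shuffled with NumPy, compressed with zlib (the same codec as HDF5's
# gzip filter) and written with write_direct_chunk(), which stores the
# pre-compressed bytes directly instead of going through HDF5's filter pipeline.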
import h5py
import numpy as np
import os
import sys
import zlib

# zlib compression level: 1 is fastest, 9 is smallest; 4 is a middle ground.
compression_level = 4
if len(sys.argv) != 3:
    print("Usage: dials.python repack_emd.py input.h5 output.h5")
    sys.exit(-1)

src_filename = sys.argv[1]
dst_filename = sys.argv[2]
tmp_filename = dst_filename + ".tmp"

if os.path.exists(dst_filename):
    print("The output file %s already exists. Exiting..." % dst_filename)
    sys.exit(0)
if os.path.exists(tmp_filename):
    os.remove(tmp_filename)

fin = h5py.File(src_filename, "r")
fout = h5py.File(tmp_filename, "w")
def compress_by_frames(src, dst):
    # The last axis is the frame index; the other axes form the frame shape.
    nframes = src.shape[-1]
    other_dims = src.shape[0:-1]
    print(src.name, src.shape, nframes, other_dims)

    # We can use src.name, which contains the FULL path.
    # Group.create_dataset accepts a full path, which is relative to the file root,
    # not the group root.
    dst_dataset = dst.create_dataset(src.name, src.shape, chunks=(*other_dims, 1),
                                     compression="gzip", shuffle=True, dtype=src.dtype)
    for (name, attr) in src.attrs.items():
        dst_dataset.attrs[name] = attr

    for i in range(nframes):
        # Byte-shuffle the frame ourselves (equivalent to HDF5's shuffle filter),
        # gzip it, and write the ready-made chunk directly; filter_mask=0 marks
        # all filters declared above as applied.
        my_chunk = src[..., i].view(dtype=np.uint8)
        itemsize = src.dtype.itemsize
        shuffled = my_chunk.reshape((-1, itemsize)).transpose().reshape(-1)
        compressed = zlib.compress(shuffled.tobytes(), compression_level)
        dst_dataset.id.write_direct_chunk(offsets=(*((0, ) * len(other_dims)), i),
                                          data=compressed, filter_mask=0)
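
# A small worked example of the shuffle above (assuming little-endian int16
# data; the values are illustrative): the two values [0x0102, 0x0304] are
# stored as bytes [02, 01, 04, 03]. Grouping byte 0 of every element, then
# byte 1, gives [02, 04, 01, 03]. Because nearby pixels tend to share their
# high bytes, the shuffled stream is typically far more compressible by gzip.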
def doit(src, dst):
    for (name, src_child) in src.items():
        if isinstance(src_child, h5py.Dataset):
            if name in ["Data", "Metadata"]:
                # These datasets have the frame index as the last axis;
                # recompress them frame by frame.
                compress_by_frames(src_child, dst)
            else:
                # Other (small) datasets are copied verbatim.
                # This needs h5py >= 3.0.
                # See https://github.com/h5py/h5py/issues/1005.
                src.copy(src_child, dst)
        else:
            # A group: recreate it in the output and recurse.
            dst.create_group(name)
            doit(src_child, dst[name])
doit(fin, fout)
fin.close()
fout.close()

# Rename the temporary file to the final name only after everything succeeded,
# so an interrupted run does not leave a broken file under the output name.
os.rename(tmp_filename, dst_filename)
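
# Hypothetical spot check of the result (the dataset path is an example;
# adjust it to your file):
#     with h5py.File(src_filename, "r") as a, h5py.File(dst_filename, "r") as b:
#         d = a["Data/Image"]
#         assert (d[..., 0] == b[d.name][..., 0]).all()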