forked from gnyers/python-tuesday
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprofiler.2
More file actions
executable file
·156 lines (126 loc) · 6.21 KB
/
profiler.2
File metadata and controls
executable file
·156 lines (126 loc) · 6.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python3
'''Recursively analyze a directory structure and provide some stats about the
disk space usage of the entire content, the number of files, directories, hard-
and symlinks.
Usage:
./profiler exampledir
Example output of a simple directory structure:
--- Statistics of directory: exampledir/
Number of errors encountered while processing 0
The number of dirs: 6
The number of non-dirs (i.e.: files, hard- and symlinks): 8
The number of symlinks: 1
The number of hardlinks: 3
Used disk space: 524 bytes
Which of the hardlinks point to the same file?
exampledir/file1.dat, exampledir/a/file2.bin, exampledir/c/e/file15 : 7 bytes
'''
import argparse
import os
import sys
from os.path import join, isdir, isfile, islink
# Metadata describing this script
__version__ = '0.0.1'
# the components of the dotted version string, each one kept as a string
version_info = tuple(part for part in __version__.split('.'))
__author__ = 'Gábor Nyers'
__date__ = '2022-05-31'
__license__ = 'GPLv3'
def getattrs(path):
    '''Collect the subset of filesystem attributes the profiler needs for path

    The numeric fields are taken from `os.lstat()`, so for a symlink they
    describe the link itself. The `isdir` and `isfile` flags come from the
    `os.path` checks, which look at the target of a symlink.

    returns: dict with the keys `isdir`, `isfile`, `issymlink`, `refcount`,
             `size` and `inode`; the key `ishardlink` (value True) is only
             present for a file that has more than one name
    raises:  whatever `os.lstat()` raises if `path` can not be examined
    '''
    st = os.lstat(path)                 # stat the object itself, not a target
    info = {}
    info['isdir'] = isdir(path)         # directory (or a symlink to one)?
    info['isfile'] = isfile(path)       # file (or a symlink to one)?
    info['issymlink'] = islink(path)    # is the object itself a symlink?
    info['refcount'] = st.st_nlink      # nr. of names referring to the inode
    info['size'] = st.st_size           # nr. of bytes
    info['inode'] = st.st_ino           # id shared by all names of one file

    # A file reachable through several names (in the same or in different
    # directories) is a hardlink
    more_than_one_name = info['refcount'] > 1
    if more_than_one_name and info['isfile']:
        info['ishardlink'] = True
    return info
def valid_dir(dirname):
    '''Validator which only lets the name of a directory through

    returns: the unchanged `dirname` if `isdir()` confirms it is a directory
    raises:  argparse.ArgumentTypeError for anything else
    '''
    if not isdir(dirname):              # reject whatever is not a directory
        raise argparse.ArgumentTypeError(f'{dirname} is not a directory')
    return dirname                      # accepted, hand it back untouched
def parseargs(cmdline=sys.argv[1:],         # parse either CLI args or a string
              known_args_only=False,        # fail if unknown args?
              description=__doc__,          # --help begins with the docstring
              epilog="That's all folks!"    # --help ends with this string
              ):
    '''Parse the command line of the profiler

    cmdline:         the arguments to parse: either a list of strings or one
                     single string, which is split the way a shell would do
                     NOTE: the default value is evaluated only once, at the
                     moment this function is defined
    known_args_only: if True, unrecognized arguments are ignored, otherwise
                     they are handed to argparse's regular error handling
    description:     text shown at the top of the --help output
    epilog:          text shown at the bottom of the --help output

    returns: the argparse namespace; its attribute `dirname` holds the
             directory that was accepted by `valid_dir`
    '''
    import shlex                            # only needed in this function

    # argparse expects a list of words: a plain string would be taken apart
    # character by character, so tokenize it first
    if isinstance(cmdline, str):
        cmdline = shlex.split(cmdline)

    p = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter,  # keep line breaks
    )
    p.add_argument('dirname',               # name of this argument
                   metavar='DIR',           # --help will show this as arg name
                   type=valid_dir,          # validator function
                   help='path to directory to analyze')  # purpose of this arg
    if known_args_only:
        return p.parse_known_args(cmdline)[0]   # want only known args
    return p.parse_args(cmdline)                # parse all args!
# Initialize stuff
args = parseargs()                      # begin of the prg, lets parse args!
errors = {}                             # path: args of the exception raised
startdir = args.dirname                 # get CLI argument 'DIR'
data = {}                               # path: attributes from getattrs()


def _walk_error(exc):
    '''Remember a directory which `os.walk` was not able to list'''
    errors[str(exc.filename)] = exc.args


# recursively iterate through the `startdir`,
# **without** following symbolic links
for path, subdirs, files in os.walk(startdir, followlinks=False,
                                    onerror=_walk_error):
    try:
        data[path] = getattrs(path)
    except OSError as e:                # e.g. the directory has vanished
        errors[path] = e.args
        continue

    # os.walk lists a symlink that points to a directory in `subdirs` and
    # does not descend into it: treat it as a non-dir object of this dir,
    # otherwise it would never show up in the statistics
    linked_dirs = [d for d in subdirs if islink(join(path, d))]
    nondirs = files + linked_dirs
    data[path]['nr_of_nondirs'] = len(nondirs)  # how many non-dir
                                                # objects in curr. dir?
    sizes = 0
    for f in nondirs:                   # nested loop to inspect dir. objects
        fpath = join(path, f)
        try:                            # in case something goes wrong...
            fattrs = getattrs(fpath)
        except OSError as e:            # ... handle the run-time error
            errors[fpath] = e.args      # remember which file and what error
            continue
        if fattrs['issymlink']:
            # getattrs() reports the *target* type in 'isdir'; the symlink
            # itself is not a directory
            fattrs['isdir'] = False
        data[fpath] = fattrs
        sizes += fattrs['size']
    data[path]['disk_usage'] = sizes    # bytes used by the non-dir objects

### For debug purposes: uncomment to see what the content of `data` is
# for p, attrs in data.items():
#     print(f'{p}: {attrs}')

### Print results:
print(f'--- Statistics of directory: {startdir}')
print(f'Number of errors encountered while processing {len(errors)}')
if errors:                              # nothing to list if all went well
    print(f' {", ".join(errors)}')

dirs = [path for path, attrs in data.items() if attrs['isdir']]
print(f'The number of dirs: {len(dirs)}')

files = [path for path, attrs in data.items() if not attrs['isdir']]
print(f'The number of non-dirs (i.e.: files, hard- and symlinks): {len(files)}')

symlinks = [path for path, attrs in data.items() if attrs.get('issymlink')]
print(f'The number of symlinks: {len(symlinks)}')

hardlinks = [path for path, attrs in data.items() if attrs.get('ishardlink')]
print(f'The number of hardlinks: {len(hardlinks)}')

# sum of the sizes of all names, a file with N hardlinked names counts N times
total_disk_usage = sum(attrs.get('disk_usage', 0)
                       for attrs in data.values()
                       if attrs['isdir'])
print(f'Used disk space: {total_disk_usage} bytes')

# build a dict where the elements are: inode: path
# every inode is kept only once, so a file with several hardlinked names
# contributes its size a single time
deduped_non_dir_inodes = {attrs['inode']: path
                          for path, attrs in data.items()
                          if not attrs['isdir']}
total_disk_usage_corrected = sum(data[path]['size']
                                 for path in deduped_non_dir_inodes.values())
print(f'Used disk space (corrected): {total_disk_usage_corrected} bytes')

print('Which of the hardlinks point to the same file? ')
distinct_files = {}                     # inode: list of names of that file
for path in hardlinks:
    distinct_files.setdefault(data[path]['inode'], []).append(path)

for paths in distinct_files.values():
    fsize = data[paths[0]]['size']      # all names share the same size
    print(f' {", ".join(paths)} : {fsize} bytes')