GITHUB_TOKEN = os.getenv("GIST_TOKEN")
GIST_ID = "c9112c25c5acd400b90741efa81aa411"
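+ # If True, take each dataset's size from datasets_size.json instead of the Kaggle API's totalBytes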
+ USE_TRUE_SIZE = True

g = Github(GITHUB_TOKEN)
gist = g.get_gist(GIST_ID)

# Format the filesize to unit'ed format
def format_bytes(num_bytes):
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
-     factor = 1000
+     factor = 1024
    unit_index = 0
    while num_bytes >= factor and unit_index < len(units) - 1:
        num_bytes /= factor
@@ -21,7 +22,7 @@ def format_bytes(num_bytes):
def unformat_bytes(string):
    units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
    num, unit = string.split(" ")
-     factor = 1000
+     factor = 1024
    return float(num) * (factor ** (units.index(unit)))

dir = "_datasets"
@@ -70,7 +71,24 @@ def unformat_bytes(string):

        # Record all the identifiers to the filename
        search_map[filename].append(f'{username}/{dataset}')
-
+
+ # remove sharma kaggle model
+ search_map["sharma2024.md"] = [x for x in search_map["sharma2024.md"] if x != 'sharmapushan/pimapnet']
+ usernames["sharmapushan"] = [x for x in usernames["sharmapushan"] if x.ref != 'sharmapushan/pimapnet']
+
+ # read true dataset sizes from json file
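+ # prefer the copy stored in the gist, falling back to a local datasets_size.json if the gist read fails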
+ try:
+     datasets_size = json.loads(gist.files['datasets_size.json'].content)
+     print(f"Read {len(datasets_size)} dataset sizes from gist")
+ except Exception as e:
+     print(f'Could not read datasets_size.json from gist: {e}')
+     if os.path.exists('datasets_size.json'):
+         with open('datasets_size.json', 'r') as f:
+             datasets_size = json.load(f)
+         print('Loading datasets_size.json from local file...')
+     else:
+         raise Exception("No datasets_size.json found")
+
# At this point we have done all the necessary scraping from Kaggle API calls
for filename in search_map:
    dataset_names = search_map[filename]
@@ -81,53 +99,55 @@ def unformat_bytes(string):

    for dsn in dataset_names:
        print(f'Processing {dsn}...')
-         # Old Kaggle Api < 1.7
+         # New Kaggle Api >= 1.7
        try:
-             user = dsn.split("/")[0]
-             dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
-             downloads.append(int(dataset['downloadCount']))
-             views.append(int(dataset['viewCount']))
-             sizes.append(int(dataset['totalBytes']))
+             user, dataset_id = dsn.split("/")
+             dataset = next((d for d in usernames[user] if d.ref == dsn))
+             downloads.append(int(dataset.download_count))
+             views.append(int(dataset.view_count))
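+             # the size lookup below keys datasets_size.json by the dataset slug (the part of the ref after "/")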
+             if USE_TRUE_SIZE:
+                 # Use the true size from the json file
+                 if dataset_id in datasets_size.keys():
+                     sizes.append(int(datasets_size[dataset_id]))
+                 else:
+                     raise Exception(f"Dataset {dataset_id} not found in datasets_size.json")
+             else:
+                 sizes.append(int(dataset.total_bytes))
            print(f'{dsn} done.')
-
-         # New Kaggle Api >=1.7
        except KeyError:
+             # Old Kaggle Api <1.7
            try:
                user = dsn.split("/")[0]
-                 dataset = next((d for d in usernames[user] if d.ref == dsn))
-                 downloads.append(int(dataset.download_count))
-                 views.append(int(dataset.view_count))
-                 sizes.append(int(dataset.total_bytes))
+                 dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
+                 downloads.append(int(dataset['downloadCount']))
+                 views.append(int(dataset['viewCount']))
+                 sizes.append(int(dataset['totalBytes']))
                print(f'{dsn} done.')
-
            except Exception:
                traceback.print_exc()
-                 print(f'Error when reading {dsn}')
-                 print(f'Continuing with 0 values...')
+                 print(f'Error when reading {dsn}, Continuing with 0 values...')
                downloads.append(0)
                views.append(0)
                sizes.append(0)

        except Exception:
            traceback.print_exc()
-             print(f'Error when reading {dsn}')
-             print(f'Continuing with 0 values...')
+             print(f'Error when reading {dsn}, Continuing with 0 values...')
            downloads.append(0)
            views.append(0)
            sizes.append(0)

-
    views = np.array(views)
    downloads = np.array(downloads)
    size_in_bytes = np.array(sizes)

    # SPECIFIC DATASET STATISTICS TO OUTPUT
-     # Take the maximum of views/downloads from each of the sub-datasets
+     # Take the maximum of views from each of the sub-datasets
    # More representative than summing, since the same user would likely view multiple sub-datasets
    ds_size_raw = np.sum(size_in_bytes)
    ds_size = format_bytes(ds_size_raw)
    ds_views = np.max(views) #np.sum(views)
-     ds_downs = np.max(downloads) #np.sum(downloads)
+     ds_downs = np.sum(downloads) #np.max(downloads)
    print(f'{filename} ({ds_size}) processed. {ds_views} views, {ds_downs} downloads.')

    if not ds_size_raw:
@@ -144,6 +164,7 @@ def unformat_bytes(string):
        'downloads': ds_downs,
    }
    json_dump[filename] = kaggle_stats
+     # breakpoint()
    total_bytes += int(np.sum(downloads * size_in_bytes))
    total_size += int(np.sum(size_in_bytes))

@@ -153,7 +174,9 @@ def unformat_bytes(string):

json_dump['total_bytes'] = total_bytes
json_dump['total_size'] = total_size
-
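+ # summary printout: total dataset size, plus total downloaded bytes estimated as size x download count summed over datasets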
+ print(f'Total size: {format_bytes(total_size)}')
+ print(f'Total downloaded bytes TB: {total_bytes / 1024**4}')
+
# Update the gist
# Need the custom encoder class to convert numpy numbers to json readable ones
class NpEncoder(json.JSONEncoder):
@@ -169,6 +192,7 @@ def default(self, obj):
print('Updating {gist}...')
try:
    gist.edit(files={'kaggle_stats.json': github.InputFileContent(content=json.dumps(json_dump, indent=4, cls=NpEncoder))})
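+     # also write datasets_size.json back so the gist copy stays in sync with whatever was loaded above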
+     gist.edit(files={'datasets_size.json': github.InputFileContent(content=json.dumps(datasets_size, indent=4, cls=NpEncoder))})
except Exception as e:
    print(f'Could not update {gist}: {e}')
    print(f'Dumping to file...')