g = Github(GITHUB_TOKEN)
gist = g.get_gist(GIST_ID)

-# Function to extract kaggle identifiers from the file
-def extract_kaggle(file_path):
-    kaggle_identifiers = []
-    kaggle_url_pattern = r'https://www\.kaggle\.com/datasets/([^/]+)/([^"]+)'
-    with open(file_path, 'r') as file:
-        text = file.read()
-        matches = re.findall(kaggle_url_pattern, text)
-        kaggle_identifiers = [f"{username}/{dataset}" for username, dataset in matches]
-    return kaggle_identifiers
-
-dir = "blastnet.github.io/_datasets"
+# Format the filesize to unit'ed format
+def format_bytes(num_bytes):
+    units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
+    factor = 1000
+    unit_index = 0
+    while num_bytes >= factor and unit_index < len(units)-1:
+        num_bytes /= factor
+        unit_index += 1
+    return f"{num_bytes:.3f} {units[unit_index]}"
+
+# Unformat the unit'ed format to the raw filesize (estimate)
+def unformat_bytes(string):
+    units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
+    num, unit = string.split(" ")
+    factor = 1000
+    return float(num)*(factor**(units.index(unit)))
+
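+# Example: format_bytes(1500000) -> "1.500 MB"; unformat_bytes("1.500 MB") -> 1500000.0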
+dir = "_datasets"
total_bytes = 0
total_size = 0
json_dump = {}
@@ -28,59 +35,71 @@ def extract_kaggle(file_path):
api_instance = kaggle.KaggleApi()
api_instance.authenticate()

+# Save array of unique usernames across all files to reduce API call
+usernames = {}
+search_map = {}
+
# For every file in the datasets folder
for filename in os.listdir(dir):
    print(f'Currently updating {filename}...')
-    dataset_names = []

    # Extract all the kaggle dataset identifiers from the page
    filepath = os.path.join(dir, filename)
    if os.path.isfile(filepath):
-        dataset_names = extract_kaggle(filepath)
-        if not dataset_names:
-            continue
-
+        kaggle_url_pattern = r'https://www\.kaggle\.com/datasets/([^/]+)/([^"]+)'
+        with open(filepath, 'r') as file:
+            text = file.read()
+            matches = re.findall(kaggle_url_pattern, text)
+            search_map[filename] = []
+            # Loop through all dataset identifiers in the file
+            for username, dataset in matches:
+                # If owner not yet recorded, scrape ALL datasets from owner
+                if username not in usernames:
+                    new_set = []
+                    cur_len = 20
+                    i = 1
+                    # Pagination length is 20
+                    while cur_len == 20:
+                        new_search = api_instance.dataset_list(search=username, page=i)
+                        new_set.extend(new_search)
+                        cur_len = len(new_search)
+                        i += 1
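+                    # A page with fewer than 20 results is the last one, so the while loop ends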
+                    # Save all the scrapes to the dictionary
+                    # Note items of new_set are Kaggle metadata objects
+                    usernames[username] = new_set
+
+                # Record all the identifiers to the filename
+                search_map[filename].append(f'{username}/{dataset}')
+
+# At this point we have done all the necessary scraping from Kaggle API calls
+for filename in search_map:
+    dataset_names = search_map[filename]
+
    downloads = []
    views = []
    sizes = []
-
+
    for dsn in dataset_names:
        print(f'Processing {dsn}...')
        try:
-            # This way is slower but only needs 1 API call per kaggle subdataset
-            # Instead of iterating over all the files multiple times
-            dataset = vars(api_instance.dataset_list(search=dsn)[0])
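+            # Look up this sub-dataset in the pre-scraped per-owner list by its 'ref' ('owner/slug') identifier instead of making a fresh API call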
+            user = dsn.split("/")[0]
+            dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
            downloads.append(int(dataset['downloadCount']))
            views.append(int(dataset['viewCount']))
            sizes.append(int(dataset['totalBytes']))
            print(f'{dsn} done.')
-
+
        except Exception as e:
            print(f'{e} when reading {dsn}')
            print(f'Continuing with 0 values...')
            downloads.append(0)
            views.append(0)
            sizes.append(0)
-
+
    views = np.array(views)
    downloads = np.array(downloads)
    size_in_bytes = np.array(sizes)
-
-    # Format the filesize to unit'ed format
-    def format_bytes(num_bytes):
-        units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
-        factor = 1000
-        unit_index = 0
-        while num_bytes >= factor and unit_index < len(units)-1:
-            num_bytes /= factor
-            unit_index += 1
-        return f"{num_bytes:.3f} {units[unit_index]}"
-    def unformat_bytes(string):
-        units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
-        num, unit = string.split(" ")
-        factor = 1000
-        return float(num)*(factor**(units.index(unit)))
-
+
    # SPECIFIC DATASET STATISTICS TO OUTPUT
    # Take the maximum of views/downloads from each of the sub-datasets
    # More representative than summing, since the same user would likely view multiple sub-datasets
@@ -110,13 +129,10 @@ def unformat_bytes(string):
if not total_bytes:
    raise Exception("Zero data encountered, exiting")
    exit()
-#old_data = json.loads(gist.files['kaggle_stats.json'].content)
-#total_bytes = old_data['total_bytes']
-#total_size = old_data['total_size']

json_dump['total_bytes'] = total_bytes
json_dump['total_size'] = total_size
-
+
# Update the gist
# Need the custom encoder class to convert numpy numbers to json readable ones
class NpEncoder(json.JSONEncoder):