
Commit 9f9dc0a

Attempt fix for Kaggle API ratelimit
Paginated scrape by username and store all returned datasets in memory
1 parent 24cb1ef commit 9f9dc0a
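
As a rough illustration of the approach described above (not part of the commit itself): the idea is to page through the Kaggle dataset listing for each dataset owner once, caching every returned dataset in memory so later lookups need no further API calls. The sketch below assumes the kaggle Python client is installed and authenticated; the names fetch_owner_datasets and owner_cache are illustrative only.

import kaggle

api = kaggle.KaggleApi()
api.authenticate()

owner_cache = {}  # username -> list of Kaggle dataset metadata objects

def fetch_owner_datasets(username, page_size=20):
    # Page through dataset_list until a short page signals the last page.
    # The Kaggle API returns at most 20 results per page, as the commit notes.
    if username in owner_cache:
        return owner_cache[username]
    results, page = [], 1
    while True:
        batch = api.dataset_list(search=username, page=page)
        results.extend(batch)
        if len(batch) < page_size:
            break
        page += 1
    owner_cache[username] = results
    return results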

File tree: 1 file changed (+58, -42 lines)


kaggle_json.py

Lines changed: 58 additions & 42 deletions
@@ -7,17 +7,24 @@
 g = Github(GITHUB_TOKEN)
 gist = g.get_gist(GIST_ID)
 
-# Function to extract kaggle identifiers from the file
-def extract_kaggle(file_path):
-    kaggle_identifiers = []
-    kaggle_url_pattern = r'https://www\.kaggle\.com/datasets/([^/]+)/([^"]+)'
-    with open(file_path, 'r') as file:
-        text = file.read()
-    matches = re.findall(kaggle_url_pattern, text)
-    kaggle_identifiers = [f"{username}/{dataset}" for username, dataset in matches]
-    return kaggle_identifiers
-
-dir = "blastnet.github.io/_datasets"
+# Format the filesize to unit'ed format
+def format_bytes(num_bytes):
+    units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
+    factor = 1000
+    unit_index = 0
+    while num_bytes >= factor and unit_index < len(units)-1:
+        num_bytes /= factor
+        unit_index += 1
+    return f"{num_bytes:.3f} {units[unit_index]}"
+
+# Unformat the unit'ed format to the raw filesize (estimate)
+def unformat_bytes(string):
+    units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
+    num,unit = string.split(" ")
+    factor = 1000
+    return float(num)*(factor**(units.index(unit)))
+
+dir = "_datasets"
 total_bytes = 0
 total_size = 0
 json_dump = {}
@@ -28,59 +35,71 @@ def extract_kaggle(file_path):
 api_instance = kaggle.KaggleApi()
 api_instance.authenticate()
 
+# Save array of unique usernames across all files to reduce API call
+usernames = {}
+search_map = {}
+
 # For every file in the datasets folder
 for filename in os.listdir(dir):
     print(f'Currently updating {filename}...')
-    dataset_names = []
 
     # Extract all the kaggle dataset identifiers from the page
     filepath = os.path.join(dir,filename)
     if os.path.isfile(filepath):
-        dataset_names = extract_kaggle(filepath)
-        if not dataset_names:
-            continue
-
+        kaggle_url_pattern = r'https://www\.kaggle\.com/datasets/([^/]+)/([^"]+)'
+        with open(filepath,'r') as file:
+            text = file.read()
+        matches = re.findall(kaggle_url_pattern,text)
+        search_map[filename] = []
+        # Loop through all dataset identifiers in the file
+        for username,dataset in matches:
+            # If owner not yet recorded, scrape ALL datasets from owner
+            if username not in usernames:
+                new_set = []
+                cur_len = 20
+                i = 1
+                # Pagination length is 20
+                while cur_len == 20:
+                    new_search = api_instance.dataset_list(search=username,page=i)
+                    new_set.extend(new_search)
+                    cur_len = len(new_search)
+                    i+=1
+                # Save all the scrapes to the dictionary
+                # Note items of new_set are Kaggle metadata objects
+                usernames[username] = new_set
+
+            # Record all the identifiers to the filename
+            search_map[filename].append(f'{username}/{dataset}')
+
+# At this point we have done all the necessary scraping from Kaggle API calls
+for filename in search_map:
+    dataset_names = search_map[filename]
+
     downloads = []
     views = []
     sizes = []
-
+
     for dsn in dataset_names:
         print(f'Processing {dsn}...')
         try:
-            # This way is slower but only needs 1 API call per kaggle subdataset
-            # Instead of iterating over all the files multiple times
-            dataset = vars(api_instance.dataset_list(search=dsn)[0])
+            user = dsn.split("/")[0]
+            dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
             downloads.append(int(dataset['downloadCount']))
            views.append(int(dataset['viewCount']))
             sizes.append(int(dataset['totalBytes']))
             print(f'{dsn} done.')
-
+
         except Exception as e:
             print(f'{e} when reading {dsn}')
             print(f'Continuing with 0 values...')
             downloads.append(0)
             views.append(0)
             sizes.append(0)
-
+
     views = np.array(views)
     downloads = np.array(downloads)
     size_in_bytes = np.array(sizes)
-
-    # Format the filesize to unit'ed format
-    def format_bytes(num_bytes):
-        units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
-        factor = 1000
-        unit_index = 0
-        while num_bytes >= factor and unit_index < len(units)-1:
-            num_bytes /= factor
-            unit_index += 1
-        return f"{num_bytes:.3f} {units[unit_index]}"
-    def unformat_bytes(string):
-        units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
-        num,unit = string.split(" ")
-        factor = 1000
-        return float(num)*(factor**(units.index(unit)))
-
+
     # SPECIFIC DATASET STATISTICS TO OUTPUT
     # Take the maximum of views/downloads from each of the sub-datasets
     # More representative than summing, since the same user would likely view multiple sub-datasets
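
For clarity, the per-dataset lookup added in this hunk matches a cached metadata object by its ref field, which Kaggle formats as owner/slug; the commit's next() call raises StopIteration on a miss and relies on the surrounding except block to fall back to zero values. A hedged, self-contained sketch of the same lookup (find_cached_dataset is an illustrative name, and it returns None on a miss instead of raising):

def find_cached_dataset(owner_cache, dsn):
    # dsn is an 'owner/slug' identifier; owner_cache maps username -> dataset metadata objects.
    user = dsn.split("/")[0]
    return next((d for d in owner_cache.get(user, []) if vars(d).get('ref') == dsn), None)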
@@ -110,13 +129,10 @@ def unformat_bytes(string):
 if not total_bytes:
     raise Exception("Zero data encountered, exiting")
     exit()
-#old_data = json.loads(gist.files['kaggle_stats.json'].content)
-#total_bytes = old_data['total_bytes']
-#total_size = old_data['total_size']
 
 json_dump['total_bytes'] = total_bytes
 json_dump['total_size'] = total_size
-
+
 # Update the gist
 # Need the custom encoder class to convert numpy numbers to json readable ones
 class NpEncoder(json.JSONEncoder):
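
For reference, the two size helpers this commit moves to module level use decimal (factor-1000) units; a quick illustrative round trip (the input values here are made up, not from the commit):

print(format_bytes(123456789))       # -> '123.457 MB'
print(unformat_bytes('123.457 MB'))  # -> 123457000.0, an estimate as the comment in the diff notes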
