
Commit 8eea134

fixed size and download issue
1 parent: 4481799

File tree

1 file changed: +48 -24 lines


kaggle_json.py

Lines changed: 48 additions & 24 deletions
@@ -3,14 +3,15 @@
 
 GITHUB_TOKEN = os.getenv("GIST_TOKEN")
 GIST_ID = "c9112c25c5acd400b90741efa81aa411"
+USE_TRUE_SIZE = True
 
 g = Github(GITHUB_TOKEN)
 gist = g.get_gist(GIST_ID)
 
 # Format the filesize to unit'ed format
 def format_bytes(num_bytes):
     units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
-    factor = 1000
+    factor = 1024
     unit_index = 0
     while num_bytes >= factor and unit_index < len(units)-1:
         num_bytes /= factor
@@ -21,7 +22,7 @@ def format_bytes(num_bytes):
 def unformat_bytes(string):
     units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
     num,unit = string.split(" ")
-    factor = 1000
+    factor = 1024
     return float(num)*(factor**(units.index(unit)))
 
 dir = "_datasets"
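
Both helpers now use the binary factor consistently. Below is a standalone sketch of the pair after this change, with a round-trip check. The tail of format_bytes (the unit_index increment and the return statement) lies outside the diff context, so the rounding and the '{num} {unit}' return format are assumptions, inferred from unformat_bytes splitting on a single space:

def format_bytes(num_bytes):
    units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
    factor = 1024
    unit_index = 0
    while num_bytes >= factor and unit_index < len(units)-1:
        num_bytes /= factor
        unit_index += 1          # assumed: not visible in the hunk
    return f'{round(num_bytes, 2)} {units[unit_index]}'  # assumed return format

def unformat_bytes(string):
    units = ['B','KB','MB','GB','TB','PB','EB','ZB','YB']
    num,unit = string.split(" ")
    factor = 1024
    return float(num)*(factor**(units.index(unit)))

print(format_bytes(1536))        # '1.5 KB' (factor 1000 would give '1.54 KB')
print(unformat_bytes('1.5 KB'))  # 1536.0
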
@@ -70,7 +71,24 @@ def unformat_bytes(string):
 
     # Record all the identifiers to the filename
     search_map[filename].append(f'{username}/{dataset}')
-
+
+# remove sharma kaggle model
+search_map["sharma2024.md"] = [x for x in search_map["sharma2024.md"] if x != 'sharmapushan/pimapnet']
+usernames["sharmapushan"] = [x for x in usernames["sharmapushan"] if x.ref != 'sharmapushan/pimapnet']
+
+# read true dataset sizes from json file
+try:
+    datasets_size = json.loads(gist.files['datasets_size.json'].content)
+    print(f"Read {len(datasets_size)} dataset sizes from gist")
+except Exception as e:
+    print(f'Could not read datasets_size.json from gist: {e}')
+    if os.path.exists('datasets_size.json'):
+        with open('datasets_size.json','r') as f:
+            datasets_size = json.load(f)
+        print('Loading datasets_size.json from local file...')
+    else:
+        raise Exception("No datasets_size.json found")
+
 # At this point we have done all the necessary scraping from Kaggle API calls
 for filename in search_map:
     dataset_names = search_map[filename]
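
The gist file loaded here appears to map bare dataset slugs (not full user/dataset refs) to raw byte counts, judging by the datasets_size[dataset_id] access in the processing loop below. A minimal sketch of that assumed shape; the slugs and sizes are illustrative, not real entries:

# Assumed structure of datasets_size.json: bare dataset slug -> size in bytes.
datasets_size = {
    "example-dataset-a": 52428800,     # 50 MiB (hypothetical entry)
    "example-dataset-b": 1073741824,   # 1 GiB (hypothetical entry)
}

# Mirror of the lookup performed in the processing loop:
dsn = "exampleuser/example-dataset-a"
user, dataset_id = dsn.split("/")
size = int(datasets_size[dataset_id])  # 52428800; a missing slug raises instead
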
@@ -81,53 +99,55 @@ def unformat_bytes(string):
 
     for dsn in dataset_names:
         print(f'Processing {dsn}...')
-        # Old Kaggle Api <1.7
+        # New Kaggle Api >=1.7
         try:
-            user = dsn.split("/")[0]
-            dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
-            downloads.append(int(dataset['downloadCount']))
-            views.append(int(dataset['viewCount']))
-            sizes.append(int(dataset['totalBytes']))
+            user, dataset_id = dsn.split("/")
+            dataset = next((d for d in usernames[user] if d.ref == dsn))
+            downloads.append(int(dataset.download_count))
+            views.append(int(dataset.view_count))
+            if USE_TRUE_SIZE:
+                # Use the true size from the json file
+                if dataset_id in datasets_size.keys():
+                    sizes.append(int(datasets_size[dataset_id]))
+                else:
+                    raise Exception(f"Dataset {dataset_id} not found in datasets_size.json")
+            else:
+                sizes.append(int(dataset.total_bytes))
             print(f'{dsn} done.')
-
-        # New Kaggle Api >=1.7
         except KeyError:
+            # Old Kaggle Api <1.7
             try:
                 user = dsn.split("/")[0]
-                dataset = next((d for d in usernames[user] if d.ref == dsn))
-                downloads.append(int(dataset.download_count))
-                views.append(int(dataset.view_count))
-                sizes.append(int(dataset.total_bytes))
+                dataset = vars(next((d for d in usernames[user] if vars(d)['ref'] == dsn)))
+                downloads.append(int(dataset['downloadCount']))
+                views.append(int(dataset['viewCount']))
+                sizes.append(int(dataset['totalBytes']))
                 print(f'{dsn} done.')
-
             except Exception:
                 traceback.print_exc()
-                print(f'Error when reading {dsn}')
-                print(f'Continuing with 0 values...')
+                print(f'Error when reading {dsn}, Continuing with 0 values...')
                 downloads.append(0)
                 views.append(0)
                 sizes.append(0)
 
         except Exception:
             traceback.print_exc()
-            print(f'Error when reading {dsn}')
-            print(f'Continuing with 0 values...')
+            print(f'Error when reading {dsn}, Continuing with 0 values...')
             downloads.append(0)
             views.append(0)
             sizes.append(0)
 
-
     views = np.array(views)
     downloads = np.array(downloads)
     size_in_bytes = np.array(sizes)
 
     # SPECIFIC DATASET STATISTICS TO OUTPUT
-    # Take the maximum of views/downloads from each of the sub-datasets
+    # Take the maximum of views from each of the sub-datasets
     # More representative than summing, since the same user would likely view multiple sub-datasets
     ds_size_raw = np.sum(size_in_bytes)
     ds_size = format_bytes(ds_size_raw)
     ds_views = np.max(views) #np.sum(views)
-    ds_downs = np.max(downloads) #np.sum(downloads)
+    ds_downs = np.sum(downloads) #np.max(downloads)
     print(f'{filename} ({ds_size}) processed. {ds_views} views, {ds_downs} downloads.')
 
     if not ds_size_raw:
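
The aggregation change here is presumably the "download" half of the commit message: views keep the max across sub-datasets, while downloads switch from max to sum. A toy check of the difference, with hypothetical counts:

import numpy as np

# Hypothetical counts for one dataset split across three Kaggle sub-datasets.
views     = np.array([120, 80, 95])
downloads = np.array([30, 12, 18])

ds_views = np.max(views)      # 120: the same user likely views several sub-datasets
ds_downs = np.sum(downloads)  # 60: after this commit, downloads accumulate
# np.max(downloads), the old behaviour, would have reported only 30
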
@@ -144,6 +164,7 @@ def unformat_bytes(string):
         'downloads': ds_downs,
     }
     json_dump[filename] = kaggle_stats
+    # breakpoint()
     total_bytes += int(np.sum(downloads*size_in_bytes))
     total_size += int(np.sum(size_in_bytes))
 
@@ -153,7 +174,9 @@ def unformat_bytes(string):
 
 json_dump['total_bytes'] = total_bytes
 json_dump['total_size'] = total_size
-
+print(f'Total size: {format_bytes(total_size)}')
+print(f'Total downloaded bytes TB: {total_bytes/1024**4}')
+
 # Update the gist
 # Need the custom encoder class to convert numpy numbers to json readable ones
 class NpEncoder(json.JSONEncoder):
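
The body of NpEncoder sits outside the diff context; only its def default(self, obj) signature surfaces in the next hunk header. For orientation, the conventional shape of such an encoder, offered as an assumption rather than the repository's actual code:

import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    # Assumed implementation: map numpy scalar/array types to JSON-native ones.
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
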
@@ -169,6 +192,7 @@ def default(self, obj):
 print('Updating {gist}...')
 try:
     gist.edit(files={'kaggle_stats.json': github.InputFileContent(content=json.dumps(json_dump,indent=4,cls=NpEncoder))})
+    gist.edit(files={'datasets_size.json': github.InputFileContent(content=json.dumps(datasets_size,indent=4,cls=NpEncoder))})
 except Exception as e:
     print(f'Could not update {gist}: {e}')
 print(f'Dumping to file...')
