Castillo_Henry_903002104 / data /cached_fineweb100B.py
henrycastillo's picture
add everything but lm eval harness
c3b20da verified
raw
history blame contribute delete
752 Bytes
import os
import sys
from huggingface_hub import hf_hub_download
# Download the pre-tokenized (GPT-2) FineWeb100B data from the Hugging Face Hub.
# This saves about an hour of startup time compared to regenerating the tokens.
def get(fname):
    """Make sure `fname` exists in the local ``fineweb100B`` directory.

    The directory lives next to this script. When the file is already on
    disk nothing is done; otherwise it is fetched from the
    ``kjj0/fineweb100B-gpt2`` dataset repository with ``hf_hub_download``.
    """
    target_dir = os.path.join(os.path.dirname(__file__), 'fineweb100B')
    cached_path = os.path.join(target_dir, fname)
    if os.path.exists(cached_path):
        return  # already present locally; skip the download
    hf_hub_download(
        repo_id="kjj0/fineweb100B-gpt2",
        filename=fname,
        repo_type="dataset",
        local_dir=target_dir,
    )
# The single validation chunk is always fetched first.
get("fineweb_val_%06d.bin" % 0)

# The full fineweb100B set is 1030 training chunks of 100M tokens each.
# An optional first command-line argument requests fewer chunks.
num_chunks = int(sys.argv[1]) if len(sys.argv) >= 2 else 1030

# Training chunks are numbered starting at 1.
for i in range(1, num_chunks + 1):
    get("fineweb_train_%06d.bin" % i)