"""
FineWeb dataset (for srs pretraining)
https://huggingface.co/datasets/HuggingFaceFW/fineweb

Example doc to highlight the structure of the dataset:
{
  "text": "Posted by mattsmith on 20th April 2012\nStraight from...",
  "id": "<urn:uuid:d853d453-196e-4488-a411-efc2b26c40d2>",
  "dump": "CC-MAIN-2013-20",
  "url": "http://nleastchatter.com/philliesphandom/tag/freddy-galvis/",
  "date": "2013-05-18T07:24:47Z",
  "file_path": "s3://commoncrawl/long.../path.../file.gz",
  "language": "en",
  "language_score": 0.9185474514961243,
  "token_count": 594
}
"""

import os
import argparse
import multiprocessing as mp

import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm

def write_datafile(filename, toks):
    """
    Saves token data as a .bin file, for reading in C.
    - First comes a header with 256 int32s
    - The tokens follow, each as a uint16
    """
    assert len(toks) < 2**31, "token count too large"  # ~2.1B tokens max per shard
    # construct the header
    header = np.zeros(256, dtype=np.int32)
    header[0] = 20240520   # magic number
    header[1] = 1          # version
    header[2] = len(toks)  # number of tokens after the 256*4 bytes of header
    # validate that no token exceeds uint16 range, then construct the tokens numpy array
    if not isinstance(toks, np.ndarray) or toks.dtype != np.uint16:
        maxtok = 2**16
        assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16"
        toks_np = np.array(toks, dtype=np.uint16)
    else:
        toks_np = toks
    # write to file
    print(f"writing {len(toks):,} tokens to {filename}")
    with open(filename, "wb") as f:
        f.write(header.tobytes())
        f.write(toks_np.tobytes())
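
# A minimal sketch of the matching reader, handy for sanity-checking shards
# from Python (`read_datafile` is our name for it; the intended consumer of
# the .bin files is C code, as the docstring above notes):
def read_datafile(filename):
    with open(filename, "rb") as f:
        header = np.frombuffer(f.read(256 * 4), dtype=np.int32)
        assert header[0] == 20240520, "magic number mismatch"
        assert header[1] == 1, "unsupported version"
        ntok = header[2]  # token count recorded in the header
        toks = np.frombuffer(f.read(), dtype=np.uint16)
    assert len(toks) == ntok, "token count mismatch"
    return toks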

parser = argparse.ArgumentParser(description="FineWeb dataset preprocessing")
parser.add_argument("-v", "--version", type=str, default="10B", help="Which version of fineweb to use: 10B|100B")
parser.add_argument("-s", "--shard_size", type=int, default=10**8, help="Size of each shard in tokens")
args = parser.parse_args()
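# example invocation (assuming this script is saved as fineweb.py):
#   python fineweb.py --version 10B --shard_size 100000000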

# the dataset is hosted in two sampled flavors on the hub: ~10B and ~100B tokens
assert args.version in ["10B", "100B"], "version must be one of 10B, 100B"
if args.version == "10B":
    local_dir = "fineweb10B"
    remote_name = "sample-10BT"
elif args.version == "100B":
    local_dir = "fineweb100B"
    remote_name = "sample-100BT"

# create the cache directory for the data shards, if it doesn't exist yet
DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# download the dataset
fw = load_dataset("HuggingFaceFW/fineweb", name=remote_name, split="train")

# init the gpt-2 tokenizer
enc = tiktoken.get_encoding("gpt2")
eot = enc._special_tokens['<|endoftext|>']  # end of text token

def tokenize(doc):
    # tokenizes a single document and returns a numpy array of uint16 tokens
    tokens = [eot]  # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(doc["text"]))
    tokens_np = np.array(tokens)
    assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16
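
# quick sanity check (illustrative only; encode_ordinary skips special-token
# handling, so plain text round-trips cleanly through the tokenizer):
#   toks = tokenize({"text": "hello world"})
#   assert toks[0] == eot
#   assert enc.decode(toks[1:].tolist()) == "hello world"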

# tokenize all documents and write output shards, each of shard_size tokens
# (the last shard holds whatever remainder is left over)
nprocs = max(1, os.cpu_count() - 2)  # leave a couple of cores free
with mp.Pool(nprocs) as pool:
    shard_index = 0
    # preallocate buffer to hold the current shard
    all_tokens_np = np.empty((args.shard_size,), dtype=np.uint16)
    token_count = 0
    progress_bar = None
    for tokens in pool.imap(tokenize, fw, chunksize=16):
        # is there enough space in the current shard for the new tokens?
        if token_count + len(tokens) < args.shard_size:
            # simply append tokens to the current shard
            all_tokens_np[token_count:token_count+len(tokens)] = tokens
            token_count += len(tokens)
            # update the progress bar
            if progress_bar is None:
                progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}")
            progress_bar.update(len(tokens))
        else:
            # the current shard is full: the first shard becomes val, the rest train
            split = "val" if shard_index == 0 else "train"
            filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin")
            # split the document: whatever fits goes into this shard, the rest into the next one
            remainder = args.shard_size - token_count
            progress_bar.update(remainder)
            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
            write_datafile(filename, all_tokens_np)
            shard_index += 1
            progress_bar = None
            # populate the next shard with the leftovers of the current doc
            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
            token_count = len(tokens)-remainder

    # write any remaining tokens as the last shard
    if token_count != 0:
        split = "val" if shard_index == 0 else "train"
        filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin")
        write_datafile(filename, all_tokens_np[:token_count])
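
# Downstream, a shard can be memory-mapped for training; one possible sketch
# (the offset of 256*4 bytes skips the int32 header written above):
#   toks = np.memmap(os.path.join(DATA_CACHE_DIR, "fineweb_train_000001.bin"),
#                    dtype=np.uint16, mode="r", offset=256 * 4)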