aapot committed
Commit e0ba7eb
1 Parent(s): b221124

Try fix git

Files changed:
- README.md +0 -0
- added_tokens.json +0 -0
- config.json +1 -1
- flax_model_to_pytorch.py +0 -0
- merges.txt +0 -0
- replace_token_script.py +0 -0
- run_clm_flax.py +1 -1
- special_tokens_map.json +0 -0
- start_train.sh +6 -6
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- train_tokenizer.py +0 -0
- vocab.json +0 -0
README.md CHANGED
File without changes

added_tokens.json CHANGED
File without changes
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "./",
   "activation_function": "gelu_new",
   "architectures": [
     "GPT2LMHeadModel"
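For context on the change above: in transformers, `_name_or_path` is informational metadata recording where a checkpoint was loaded from, so pointing it at "./" makes the config refer to the repo itself rather than a hub ID. A minimal sketch of how the value surfaces when loading from a local clone; this snippet is illustrative, not code from this repo:

from transformers import AutoConfig, AutoModelForCausalLM

# Load the config from a local clone; "./" matches the committed value.
config = AutoConfig.from_pretrained("./")
print(config._name_or_path)   # "./"
print(config.architectures)   # ["GPT2LMHeadModel"]

# The same local path also resolves the model weights.
model = AutoModelForCausalLM.from_pretrained("./")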
flax_model_to_pytorch.py CHANGED
File without changes

merges.txt CHANGED
File without changes

replace_token_script.py CHANGED
File without changes
run_clm_flax.py CHANGED
@@ -61,7 +61,7 @@ from transformers import (
 from transformers.file_utils import get_full_repo_name
 from transformers.testing_utils import CaptureLogger
 
-from distributed_shampoo import distributed_shampoo, GraftingType
+# from distributed_shampoo import distributed_shampoo, GraftingType
 
 
 logger = logging.getLogger(__name__)
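Commenting out the distributed_shampoo import (presumably part of the "Try fix git" cleanup, since distributed_shampoo.py is not among the files in this repo) leaves the script on its non-Shampoo optimizer path. A hedged sketch of an AdamW setup matching the flags in start_train.sh; the schedule construction and the total_train_steps value are assumptions, not lines from run_clm_flax.py:

import optax

total_train_steps = 100_000  # hypothetical; the real script derives this from the dataset size

# Linear warmup into cosine decay, mirroring --warmup_steps and --cosine_decay.
schedule = optax.warmup_cosine_decay_schedule(
    init_value=0.0,
    peak_value=4e-5,      # --learning_rate
    warmup_steps=4000,    # --warmup_steps
    decay_steps=total_train_steps,
)

optimizer = optax.adamw(
    learning_rate=schedule,
    b1=0.9,               # --adam_beta1
    b2=0.98,              # --adam_beta2
    weight_decay=0.01,    # --weight_decay
)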
special_tokens_map.json CHANGED
File without changes
start_train.sh CHANGED
@@ -10,19 +10,19 @@ python3 run_clm_flax.py \
     --dataset_filepath="/researchdisk/training_dataset_full_deduplicated" \
     --do_train --do_eval \
     --block_size="512" \
-    --per_device_train_batch_size="
-    --per_device_eval_batch_size="
-    --preprocessing_num_workers="
+    --per_device_train_batch_size="8" \
+    --per_device_eval_batch_size="8" \
+    --preprocessing_num_workers="96" \
     --adam_beta1="0.9" \
     --adam_beta2="0.98" \
-    --learning_rate="
+    --learning_rate="4e-5" \
     --weight_decay="0.01" \
     --warmup_steps="4000" \
     --cosine_decay \
     --overwrite_output_dir \
     --logging_steps="500" \
-    --eval_steps="
-    --save_steps="
+    --eval_steps="100" \
+    --save_steps="100" \
     --num_train_epochs="5" \
     --dtype="bfloat16" \
     --push_to_hub \
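The concrete values filled in above pin down the training throughput. A back-of-envelope check; the device count (8, e.g. a single TPU v3-8) is an assumption not recorded in the commit:

# Effective batch size under the new flags; the device count is hypothetical.
per_device_batch = 8        # --per_device_train_batch_size
devices = 8                 # assumed jax.device_count() on a TPU v3-8
block_size = 512            # --block_size

effective_batch = per_device_batch * devices    # 64 sequences per optimizer step
tokens_per_step = effective_batch * block_size  # 32,768 tokens per step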
tokenizer.json CHANGED
The diff for this file is too large to render.
tokenizer_config.json CHANGED
File without changes

train_tokenizer.py CHANGED
File without changes

vocab.json CHANGED
File without changes