dbrx-full-ft.yaml
variables:
  # Run Name
  run_name:  # If left blank, will be read from env var $RUN_NAME

  # Note: This requires ~64x80GB GPUs
  max_seq_len: 4096
  icl_seq_len: 1024

run_name: ${variables.run_name}

max_seq_len: ${variables.max_seq_len}
icl_seq_len: ${variables.icl_seq_len}
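
# The ${variables.*} references above rely on llm-foundry's OmegaConf-style
# interpolation, so values defined once under `variables` are reused
# throughout the file (mechanism assumed from llm-foundry's config handling).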

# Model
model:
  name: hf_causal_lm
  pretrained: true
  init_device: mixed
  use_auth_token: true
  config_overrides: {}
  use_flash_attention_2: true
  pretrained_model_name_or_path: databricks/dbrx-instruct
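
# Note: databricks/dbrx-instruct is a gated Hugging Face repo, so
# use_auth_token: true assumes a valid HF token is available (e.g. via
# HF_TOKEN or `huggingface-cli login`). init_device: mixed is understood to
# materialize weights on rank 0 only, with meta tensors elsewhere, to keep
# initialization memory manageable at this scale.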

# Tokenizer
tokenizer:
  name: databricks/dbrx-instruct
  kwargs:
    model_max_length: ${variables.max_seq_len}
    trust_remote_code: true

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    split: train
    hf_name: mosaicml/dolly_hhrlhf
    shuffle: true
    max_seq_len: ${variables.max_seq_len}
    eos_token_id: 0
    packing_ratio: auto
    allow_pad_trimming: false
    decoder_only_format: true
  drop_last: true
  pin_memory: true
  num_workers: 8
  prefetch_factor: 2
  persistent_workers: true
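
# packing_ratio: auto is expected to profile candidate ratios and pick one
# that packs several short dolly_hhrlhf examples into each 4096-token
# sequence, cutting padding waste; eos_token_id: 0 is understood to mark
# example boundaries within packed sequences (value taken from this file).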

eval_loader:
  name: finetuning
  dataset:
    split: test
    hf_name: mosaicml/dolly_hhrlhf
    shuffle: false
    max_seq_len: ${variables.max_seq_len}
    packing_ratio: null
    allow_pad_trimming: false
    decoder_only_format: true
  drop_last: true
  pin_memory: true
  num_workers: 8
  prefetch_factor: 2
  persistent_workers: true
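
# The eval loader mirrors the train loader but disables shuffling and
# packing (packing_ratio: null), so evaluation runs on unpacked sequences.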

# Optimization
optimizer:
  lr: 0.000001
  name: decoupled_lionw
  betas:
  - 0.9
  - 0.95
  weight_decay: 1.0e-06
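
# decoupled_lionw is llm-foundry's Lion optimizer variant with decoupled
# weight decay; the 1e-6 learning rate is deliberately small, as is common
# for full-parameter fine-tuning of a large pretrained model.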

scheduler:
  name: cosine_with_warmup
  alpha_f: 0
  t_warmup: 0.02dur
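
# t_warmup: 0.02dur uses Composer time units: `dur` is a fraction of total
# training duration, so the LR warms up over the first 2% of the run, then
# follows a cosine decay toward alpha_f * peak_lr = 0.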

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1

max_duration: 2ep
eval_interval: 1ep
global_train_batch_size: 64
eval_first: false
# eval_subset_num_batches: -1
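
# Composer time-unit suffixes used in this file: ep = epochs, ba = batches,
# dur = fraction of total duration. 2ep with eval_interval: 1ep yields one
# evaluation pass at the end of each of the two epochs.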

# System
seed: 17
device_train_microbatch_size: 1
device_eval_batch_size: 1
precision: amp_bf16
dist_timeout: 3600
expandable_segments: true
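
# With global_train_batch_size: 64 and device_train_microbatch_size: 1, a
# 64-GPU run (per the note above) gives each device exactly one microbatch
# per step (64 / 64 = 1), so no gradient accumulation is required; fewer
# GPUs would instead accumulate microbatches to reach the global batch size.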

# FSDP
fsdp_config:
  mixed_precision: PURE
  state_dict_type: sharded
  limit_all_gathers: true
  sharding_strategy: FULL_SHARD
  activation_cpu_offload: false
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
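
# FULL_SHARD shards parameters, gradients, and optimizer state across all
# ranks (ZeRO-3-style), and state_dict_type: sharded keeps checkpoints
# sharded per rank as well. Activation checkpointing trades recompute for
# memory, which helps full fine-tuning of DBRX fit on 80GB devices.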

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

# Callbacks
callbacks:
  lr_monitor: {}
  speed_monitor:
    window_size: 1
  memory_monitor: {}
  hf_checkpointer:
    overwrite: true
    precision: bfloat16
    save_folder: ./{run_name}/checkpoints
    save_interval: 1dur
  runtime_estimator: {}
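
# hf_checkpointer is assumed to export a Hugging Face-format checkpoint in
# bfloat16; save_interval: 1dur fires once, at the end of training.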

# Checkpoint to local filesystem or remote object store
# save_interval: 5000ba
# save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
# save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

# Logging
# loggers:
#   wandb:
#     name:
#     group:
#   mlflow:
#     tracking_uri:
#     experiment_name:
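
# A typical launch, assuming llm-foundry's standard layout (the paths below
# are illustrative, not taken from this file):
#   cd llm-foundry/scripts/train
#   composer train.py yamls/finetune/dbrx-full-ft.yaml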