Skip to content

Commit

Permalink
WIP: fix-66
Browse files (browse the repository at this point in the history)
Signed-off-by: Michael Clifford <mcliffor@redhat.com>
  • Loading branch information
MichaelClifford committed Oct 9, 2024
1 parent b1d174a commit 9b0fd55
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 72 deletions.
166 changes: 98 additions & 68 deletions pipeline.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -762,26 +762,33 @@ deploymentSpec:
), i)\n for i, model in enumerate(models)\n )[-1]\n \
\ newest_model = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\
\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n name\
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n image =\
\ \"quay.io/shanand/test-train:0.0.4\"\n if phase_name == \"first\":\n\
\ path_to_model = \"/input_model/model\"\n elif phase_name ==\
\ \"second\":\n path_to_model = list_phase1_final_model()\n\n \
\ manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion: kubeflow.org/v1\n\
\ kind: PyTorchJob\n metadata:\n name: {name}\n \
\ spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\\\"\n \
\ pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ /output/model --data_output_dir /input_data/processed_data\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\
\ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
\ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
\ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
\\\"\n pytorchReplicaSpecs:\n Master:\n \
\ replicas: 1\n restartPolicy: OnFailure\n template:\n\
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ export XDG_CACHE_HOME=/tmp\n export TRITON_CACHE_DIR=/tmp\n\
\ export HF_HOME=/tmp\n \
\ export TRANSFORMERS_CACHE=/tmp\n torchrun --nnodes\
\ {nnodes} --nproc_per_node {nproc_per_node} --node_rank \\$(RANK) --rdzv_endpoint\
\ \\$(MASTER_ADDR):\\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model}\
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
Expand All @@ -805,20 +812,28 @@ deploymentSpec:
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /tmp/model;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model}\
\ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\ export TRITON_CACHE_DIR=/tmp\n \
\ export XDG_CACHE_HOME=/tmp\n export HF_HOME=/tmp\n\
\ export TRANSFORMERS_CACHE=/tmp\n \
\ torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node}\
\ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\
\ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n - name:\
\ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\\\"\n resources:\n requests:\n\
\ cpu: 2\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n limits:\n \
Expand Down Expand Up @@ -867,26 +882,33 @@ deploymentSpec:
), i)\n for i, model in enumerate(models)\n )[-1]\n \
\ newest_model = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\
\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n name\
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n image =\
\ \"quay.io/shanand/test-train:0.0.4\"\n if phase_name == \"first\":\n\
\ path_to_model = \"/input_model/model\"\n elif phase_name ==\
\ \"second\":\n path_to_model = list_phase1_final_model()\n\n \
\ manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion: kubeflow.org/v1\n\
\ kind: PyTorchJob\n metadata:\n name: {name}\n \
\ spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\\\"\n \
\ pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ /output/model --data_output_dir /input_data/processed_data\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\
\ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
\ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
\ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
\\\"\n pytorchReplicaSpecs:\n Master:\n \
\ replicas: 1\n restartPolicy: OnFailure\n template:\n\
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ export XDG_CACHE_HOME=/tmp\n export TRITON_CACHE_DIR=/tmp\n\
\ export HF_HOME=/tmp\n \
\ export TRANSFORMERS_CACHE=/tmp\n torchrun --nnodes\
\ {nnodes} --nproc_per_node {nproc_per_node} --node_rank \\$(RANK) --rdzv_endpoint\
\ \\$(MASTER_ADDR):\\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model}\
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
Expand All @@ -910,20 +932,28 @@ deploymentSpec:
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n mkdir -p /tmp/model;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model}\
\ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath:\
\ /input_data\n name: input-data\n \
\ readOnly: true\n - mountPath: /input_model\n\
\ name: model\n readOnly:\
\ true\n - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\ export TRITON_CACHE_DIR=/tmp\n \
\ export XDG_CACHE_HOME=/tmp\n export HF_HOME=/tmp\n\
\ export TRANSFORMERS_CACHE=/tmp\n \
\ torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node}\
\ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\
\ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \
\ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\
\ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\
\ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\
\ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n readOnly: true\n \
\ env:\n - name: NNODES\n \
\ value: \\\\\"{nnodes}\\\\\"\n - name:\
\ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\
\\\"\n resources:\n requests:\n\
\ cpu: 2\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n limits:\n \
Expand Down
15 changes: 11 additions & 4 deletions training/components.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -105,12 +105,11 @@ def list_phase1_final_model():

Outputs = NamedTuple("outputs", manifest=str, name=str)
name = f"train-{phase_name}-{name_suffix.rstrip('-sdg')}"

image = "quay.io/shanand/test-train:0.0.4"
if phase_name == "first":
path_to_model = "/input_model/model"
elif phase_name == "second":
path_to_model = list_phase1_final_model()
image = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989"

manifest = inspect.cleandoc(
f"""
Expand All @@ -134,7 +133,11 @@ def list_phase1_final_model():
- |
mkdir -p /output/model;
mkdir -p /output/data;
python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /output/model --data_output_dir /input_data/processed_data
export XDG_CACHE_HOME=/tmp
export TRITON_CACHE_DIR=/tmp
export HF_HOME=/tmp
export TRANSFORMERS_CACHE=/tmp
torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch
command:
- /bin/bash
- '-c'
Expand Down Expand Up @@ -184,7 +187,11 @@ def list_phase1_final_model():
- args:
- |
mkdir -p /tmp/model;
python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data
export TRITON_CACHE_DIR=/tmp
export XDG_CACHE_HOME=/tmp
export HF_HOME=/tmp
export TRANSFORMERS_CACHE=/tmp
torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch
command:
- /bin/bash
- '-c'
Expand Down

0 comments on commit 9b0fd55

Please sign in to comment.