diff --git a/pipeline.yaml b/pipeline.yaml index 9263a72..3e9b3a9 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -762,26 +762,33 @@ deploymentSpec: ), i)\n for i, model in enumerate(models)\n )[-1]\n \ \ newest_model = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\ \n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n name\ - \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n image =\ - \ \"quay.io/shanand/test-train:0.0.4\"\n if phase_name == \"first\":\n\ - \ path_to_model = \"/input_model/model\"\n elif phase_name ==\ - \ \"second\":\n path_to_model = list_phase1_final_model()\n\n \ - \ manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion: kubeflow.org/v1\n\ - \ kind: PyTorchJob\n metadata:\n name: {name}\n \ - \ spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\\\"\n \ - \ pytorchReplicaSpecs:\n Master:\n replicas:\ - \ 1\n restartPolicy: OnFailure\n template:\n \ - \ metadata:\n annotations:\n \ - \ sidecar.istio.io/inject: 'false'\n spec:\n \ - \ containers:\n - args:\n \ - \ - |\n mkdir -p /output/model;\n \ - \ mkdir -p /output/data;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\ - \ /output/model --data_output_dir /input_data/processed_data\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ + \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\ + \ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\ + \ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\ + \ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\ + \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ + \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ + \ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\ + \\\"\n pytorchReplicaSpecs:\n Master:\n \ + \ replicas: 1\n restartPolicy: OnFailure\n template:\n\ + \ metadata:\n annotations:\n \ + \ sidecar.istio.io/inject: 'false'\n spec:\n \ + \ containers:\n - args:\n \ + \ - |\n mkdir -p /output/model;\n \ + \ mkdir -p /output/data;\n \ + \ export XDG_CACHE_HOME=/tmp\n export TRITON_CACHE_DIR=/tmp\n\ + \ export HF_HOME=/tmp\n \ + \ export TRANSFORMERS_CACHE=/tmp\n torchrun --nnodes\ + \ {nnodes} --nproc_per_node {nproc_per_node} --node_rank \\$(RANK) --rdzv_endpoint\ + \ \\$(MASTER_ADDR):\\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model}\ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ \ name: input-data\n readOnly:\ \ true\n - mountPath: /input_model\n \ \ name: model\n readOnly: true\n \ @@ -805,20 +812,28 @@ deploymentSpec: \ sidecar.istio.io/inject: 'false'\n spec:\n\ \ containers:\n - args:\n \ \ - |\n mkdir -p /tmp/model;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model}\ - \ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ 
readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n readOnly: true\n \ - \ env:\n - name: NNODES\n \ - \ value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ + \ export TRITON_CACHE_DIR=/tmp\n \ + \ export XDG_CACHE_HOME=/tmp\n export HF_HOME=/tmp\n\ + \ export TRANSFORMERS_CACHE=/tmp\n \ + \ torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node}\ + \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\ + \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\\\"{nnodes}\\\\\"\n - name:\ + \ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ \\\"\n resources:\n requests:\n\ \ cpu: 2\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n limits:\n \ @@ -867,26 +882,33 @@ deploymentSpec: ), i)\n for i, model in enumerate(models)\n )[-1]\n \ \ newest_model = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\ \n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n name\ - \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n image =\ - \ \"quay.io/shanand/test-train:0.0.4\"\n if phase_name == \"first\":\n\ - \ path_to_model = \"/input_model/model\"\n elif phase_name ==\ - \ \"second\":\n path_to_model = list_phase1_final_model()\n\n \ - \ manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion: kubeflow.org/v1\n\ - \ kind: PyTorchJob\n metadata:\n name: {name}\n \ - \ spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\\\"\n \ - \ pytorchReplicaSpecs:\n Master:\n replicas:\ - \ 1\n restartPolicy: OnFailure\n template:\n \ - \ metadata:\n annotations:\n \ - \ sidecar.istio.io/inject: 'false'\n spec:\n \ - \ containers:\n - args:\n \ - \ - |\n mkdir -p /output/model;\n \ - \ mkdir -p /output/data;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\ - \ /output/model --data_output_dir /input_data/processed_data\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ + \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\ + \ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\ + \ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\ + \ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\ + \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ + \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ + \ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\ + \\\"\n pytorchReplicaSpecs:\n Master:\n \ + \ replicas: 1\n restartPolicy: OnFailure\n template:\n\ + \ metadata:\n annotations:\n \ + \ sidecar.istio.io/inject: 'false'\n spec:\n \ + \ containers:\n - args:\n \ + \ - |\n mkdir -p /output/model;\n \ + \ mkdir -p /output/data;\n \ 
+ \ export XDG_CACHE_HOME=/tmp\n export TRITON_CACHE_DIR=/tmp\n\ + \ export HF_HOME=/tmp\n \ + \ export TRANSFORMERS_CACHE=/tmp\n torchrun --nnodes\ + \ {nnodes} --nproc_per_node {nproc_per_node} --node_rank \\$(RANK) --rdzv_endpoint\ + \ \\$(MASTER_ADDR):\\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model}\ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ \ name: input-data\n readOnly:\ \ true\n - mountPath: /input_model\n \ \ name: model\n readOnly: true\n \ @@ -910,20 +932,28 @@ deploymentSpec: \ sidecar.istio.io/inject: 'false'\n spec:\n\ \ containers:\n - args:\n \ \ - |\n mkdir -p /tmp/model;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model}\ - \ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n readOnly: true\n \ - \ env:\n - name: NNODES\n \ - \ value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ + \ export TRITON_CACHE_DIR=/tmp\n \ + \ export XDG_CACHE_HOME=/tmp\n export HF_HOME=/tmp\n\ + \ export TRANSFORMERS_CACHE=/tmp\n \ + \ torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node}\ + \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\ + \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\\\"{nnodes}\\\\\"\n - name:\ + \ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ \\\"\n resources:\n requests:\n\ \ cpu: 2\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n limits:\n \ diff --git a/training/components.py b/training/components.py index 2c4a1d4..f1239d4 100644 --- a/training/components.py +++ b/training/components.py @@ -105,12 +105,11 @@ def list_phase1_final_model(): Outputs = NamedTuple("outputs", manifest=str, name=str) name = f"train-{phase_name}-{name_suffix.rstrip('-sdg')}" - - image = "quay.io/shanand/test-train:0.0.4" if phase_name == "first": path_to_model = "/input_model/model" elif phase_name == "second": path_to_model = list_phase1_final_model() + image = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989" manifest = inspect.cleandoc( f""" @@ -134,7 +133,11 @@ def 
list_phase1_final_model(): - | mkdir -p /output/model; mkdir -p /output/data; - python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /output/model --data_output_dir /input_data/processed_data + export XDG_CACHE_HOME=/tmp + export TRITON_CACHE_DIR=/tmp + export HF_HOME=/tmp + export TRANSFORMERS_CACHE=/tmp + torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch command: - /bin/bash - '-c' @@ -184,7 +187,11 @@ def list_phase1_final_model(): - args: - | mkdir -p /tmp/model; - python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data + export TRITON_CACHE_DIR=/tmp + export XDG_CACHE_HOME=/tmp + export HF_HOME=/tmp + export TRANSFORMERS_CACHE=/tmp + torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch command: - /bin/bash - '-c'
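
For reference, a minimal sketch of the training command the new Master container args resolve to once the manifest placeholders are filled in. Everything except the node counts, the RANK/MASTER_ADDR/MASTER_PORT fallback values, and the phase-1 model path is taken verbatim from the new args block; nnodes=2, nproc_per_node=8, and port 29500 are illustrative assumptions, and RANK, MASTER_ADDR and MASTER_PORT are assumed to be injected into the pod environment by the PyTorch training operator:

    # Cache locations are redirected to /tmp, as in the new manifest.
    export XDG_CACHE_HOME=/tmp
    export TRITON_CACHE_DIR=/tmp
    export HF_HOME=/tmp
    export TRANSFORMERS_CACHE=/tmp
    # RANK/MASTER_ADDR/MASTER_PORT are expected from the pod environment;
    # the defaults below are examples only, not values from the pipeline.
    torchrun \
      --nnodes 2 \
      --nproc_per_node 8 \
      --node_rank "${RANK:-0}" \
      --rdzv_endpoint "${MASTER_ADDR:-localhost}:${MASTER_PORT:-29500}" \
      -m instructlab.training.main_ds \
        --model_name_or_path=/input_model/model \
        --data_path=/input_data/processed_data/data.jsonl \
        --output_dir=/output/model \
        --num_epochs=2 \
        --effective_batch_size=3840 \
        --learning_rate=1e-4 \
        --num_warmup_steps=800 \
        --save_samples=0 \
        --log_level=INFO \
        --max_batch_len=20000 \
        --seed=42 \
        --cpu_offload_optimizer \
        --sharding_strategy=FULL_SHARD \
        --is_granite \
        --checkpoint_at_epoch

The second container spec touched lower in the manifest (presumably the Worker replica) gets the same env setup and torchrun invocation, differing only in its checkpoint directory (--output_dir=/tmp/model) and learning rate (2e-6 instead of 1e-4), per the diff above.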