diff --git a/pipeline.yaml b/pipeline.yaml index 60da312..440cec5 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -852,61 +852,73 @@ deploymentSpec: \ int = 2,\n) -> NamedTuple(\"outputs\", manifest=str, name=str):\n import\ \ inspect\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n\ \ name = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n \ - \ image = \"quay.io/shanand/test-train:0.0.4\"\n\n manifest = inspect.cleandoc(\n\ - \ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\ - \ metadata:\n name: {name}\n spec:\n nprocPerNode:\ - \ \\\\\"{nproc_per_node}\\\\\"\n pytorchReplicaSpecs:\n \ - \ Master:\n replicas: 1\n restartPolicy: OnFailure\n\ - \ template:\n metadata:\n annotations:\n\ - \ sidecar.istio.io/inject: 'false'\n spec:\n\ - \ containers:\n - args:\n \ - \ - |\n mkdir -p /output/model;\n\ - \ mkdir -p /output/data;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\ - \ /output/model --data_output_dir /input_data/processed_data\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ + \ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\ + \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ + \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ + \ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\ + \\\"\n pytorchReplicaSpecs:\n Master:\n \ + \ replicas: 1\n restartPolicy: OnFailure\n template:\n\ + \ metadata:\n annotations:\n \ + \ sidecar.istio.io/inject: 'false'\n spec:\n \ + \ containers:\n - args:\n \ + \ - |\n mkdir -p /output/model;\n \ + \ mkdir -p /output/data;\n \ + \ export XDG_CACHE_HOME=/tmp\n export TRITON_CACHE_DIR=/tmp\n\ + \ export HF_HOME=/tmp\n \ + \ export TRANSFORMERS_CACHE=/tmp\n torchrun --nnodes\ + \ {nnodes} --nproc_per_node {nproc_per_node} --node_rank \\$(RANK) --rdzv_endpoint\ + \ \\$(MASTER_ADDR):\\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model}\ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ \ name: input-data\n readOnly:\ \ true\n - mountPath: /input_model\n \ \ name: model\n readOnly: true\n \ \ - mountPath: /output\n \ - \ name: output\n env:\n - name:\ - \ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value:\ - \ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \ - \ requests:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ - \ - name: input-data\n persistentVolumeClaim:\n\ - \ claimName: {input_pvc_name}\n \ - \ - name: model\n persistentVolumeClaim:\n \ - \ claimName: {model_pvc_name}\n - name:\ - \ output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n Worker:\n \ - \ replicas: {nnodes-1}\n restartPolicy: OnFailure\n \ - \ template:\n metadata:\n annotations:\n\ - \ sidecar.istio.io/inject: 'false'\n spec:\n\ - \ containers:\n - args:\n \ - \ - |\n mkdir -p /tmp/model;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model}\ - \ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n readOnly: true\n \ - \ env:\n - name: NNODES\n \ - \ value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ - \\\"\n resources:\n requests:\n\ + \ name: output\n resources:\n \ + \ requests:\n cpu: 2\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ \ cpu: 2\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n volumes:\n - name:\ + \ input-data\n persistentVolumeClaim:\n \ + \ claimName: {input_pvc_name}\n - name: model\n\ + \ persistentVolumeClaim:\n claimName:\ + \ {model_pvc_name}\n - name: output\n \ + \ persistentVolumeClaim:\n claimName: {output_pvc_name}\n\ + \ Worker:\n replicas: {nnodes-1}\n \ + \ restartPolicy: OnFailure\n template:\n metadata:\n\ + \ annotations:\n sidecar.istio.io/inject:\ + \ 'false'\n spec:\n containers:\n \ + \ - args:\n - |\n \ + \ mkdir -p /tmp/model;\n export TRITON_CACHE_DIR=/tmp\n\ + \ export XDG_CACHE_HOME=/tmp\n \ + \ export HF_HOME=/tmp\n export TRANSFORMERS_CACHE=/tmp\n\ + \ torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node}\ + \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\ + \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite\n command:\n\ + \ - /bin/bash\n - '-c'\n \ + \ - '--'\n image: {image}\n \ + \ name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ readOnly: true\n env:\n\ + \ - name: NNODES\n value:\ + \ \\\\\"{nnodes}\\\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\\\"{nproc_per_node}\\\\\"\n \ + \ resources:\n requests:\n \ + \ cpu: 2\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n limits:\n \ \ cpu: 2\n \"nvidia.com/gpu\": {nproc_per_node}\n\ \ volumes:\n - name: input-data\n \ @@ -949,61 +961,73 @@ deploymentSpec: \ int = 2,\n) -> NamedTuple(\"outputs\", manifest=str, name=str):\n import\ \ inspect\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n\ \ name = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n \ - \ image = \"quay.io/shanand/test-train:0.0.4\"\n\n manifest = inspect.cleandoc(\n\ - \ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\ - \ metadata:\n name: {name}\n spec:\n nprocPerNode:\ - \ \\\\\"{nproc_per_node}\\\\\"\n pytorchReplicaSpecs:\n \ - \ Master:\n replicas: 1\n restartPolicy: OnFailure\n\ - \ template:\n metadata:\n annotations:\n\ - \ sidecar.istio.io/inject: 'false'\n spec:\n\ - \ containers:\n - args:\n \ - \ - |\n mkdir -p /output/model;\n\ - \ mkdir -p /output/data;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\ - \ /output/model --data_output_dir /input_data/processed_data\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ + \ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\ + \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ + \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ + \ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\ + \\\"\n pytorchReplicaSpecs:\n Master:\n \ + \ replicas: 1\n restartPolicy: OnFailure\n template:\n\ + \ metadata:\n annotations:\n \ + \ sidecar.istio.io/inject: 'false'\n spec:\n \ + \ containers:\n - args:\n \ + \ - |\n mkdir -p /output/model;\n \ + \ mkdir -p /output/data;\n \ + \ export XDG_CACHE_HOME=/tmp\n export TRITON_CACHE_DIR=/tmp\n\ + \ export HF_HOME=/tmp\n \ + \ export TRANSFORMERS_CACHE=/tmp\n torchrun --nnodes\ + \ {nnodes} --nproc_per_node {nproc_per_node} --node_rank \\$(RANK) --rdzv_endpoint\ + \ \\$(MASTER_ADDR):\\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model}\ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ \ name: input-data\n readOnly:\ \ true\n - mountPath: /input_model\n \ \ name: model\n readOnly: true\n \ \ - mountPath: /output\n \ - \ name: output\n env:\n - name:\ - \ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value:\ - \ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \ - \ requests:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ - \ - name: input-data\n persistentVolumeClaim:\n\ - \ claimName: {input_pvc_name}\n \ - \ - name: model\n persistentVolumeClaim:\n \ - \ claimName: {model_pvc_name}\n - name:\ - \ output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n Worker:\n \ - \ replicas: {nnodes-1}\n restartPolicy: OnFailure\n \ - \ template:\n metadata:\n annotations:\n\ - \ sidecar.istio.io/inject: 'false'\n spec:\n\ - \ containers:\n - args:\n \ - \ - |\n mkdir -p /tmp/model;\n \ - \ python3.11 -u run_main_ds.py --model_path {path_to_model}\ - \ --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n readOnly: true\n \ - \ env:\n - name: NNODES\n \ - \ value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ - \\\"\n resources:\n requests:\n\ + \ name: output\n resources:\n \ + \ requests:\n cpu: 2\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ \ cpu: 2\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n volumes:\n - name:\ + \ input-data\n persistentVolumeClaim:\n \ + \ claimName: {input_pvc_name}\n - name: model\n\ + \ persistentVolumeClaim:\n claimName:\ + \ {model_pvc_name}\n - name: output\n \ + \ persistentVolumeClaim:\n claimName: {output_pvc_name}\n\ + \ Worker:\n replicas: {nnodes-1}\n \ + \ restartPolicy: OnFailure\n template:\n metadata:\n\ + \ annotations:\n sidecar.istio.io/inject:\ + \ 'false'\n spec:\n containers:\n \ + \ - args:\n - |\n \ + \ mkdir -p /tmp/model;\n export TRITON_CACHE_DIR=/tmp\n\ + \ export XDG_CACHE_HOME=/tmp\n \ + \ export HF_HOME=/tmp\n export TRANSFORMERS_CACHE=/tmp\n\ + \ torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node}\ + \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\ + \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \ + \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ + \ --sharding_strategy=FULL_SHARD --is_granite\n command:\n\ + \ - /bin/bash\n - '-c'\n \ + \ - '--'\n image: {image}\n \ + \ name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ readOnly: true\n env:\n\ + \ - name: NNODES\n value:\ + \ \\\\\"{nnodes}\\\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\\\"{nproc_per_node}\\\\\"\n \ + \ resources:\n requests:\n \ + \ cpu: 2\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n limits:\n \ \ cpu: 2\n \"nvidia.com/gpu\": {nproc_per_node}\n\ \ volumes:\n - name: input-data\n \ diff --git a/training/components.py b/training/components.py index 4db6a0a..09e0be9 100644 --- a/training/components.py +++ b/training/components.py @@ -95,7 +95,7 @@ def pytorchjob_manifest_op( Outputs = NamedTuple("outputs", manifest=str, name=str) name = f"train-{phase_name}-{name_suffix.rstrip('-sdg')}" - image = "quay.io/shanand/test-train:0.0.4" + image = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989" manifest = inspect.cleandoc( f""" @@ -119,7 +119,11 @@ def pytorchjob_manifest_op( - | mkdir -p /output/model; mkdir -p /output/data; - python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /output/model --data_output_dir /input_data/processed_data + export XDG_CACHE_HOME=/tmp + export TRITON_CACHE_DIR=/tmp + export HF_HOME=/tmp + export TRANSFORMERS_CACHE=/tmp + torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch command: - /bin/bash - '-c' @@ -135,11 +139,6 @@ def pytorchjob_manifest_op( readOnly: true - mountPath: /output name: output - env: - - name: NNODES - value: \\"{nnodes}\\" - - name: NPROC_PER_NODE - value: \\"{nproc_per_node}\\" resources: requests: cpu: 2 @@ -169,7 +168,11 @@ def pytorchjob_manifest_op( - args: - | mkdir -p /tmp/model; - python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir /tmp/model --data_output_dir /input_data/processed_data + export TRITON_CACHE_DIR=/tmp + export XDG_CACHE_HOME=/tmp + export HF_HOME=/tmp + export TRANSFORMERS_CACHE=/tmp + torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite command: - /bin/bash - '-c'