From 42bcfb44a9c410571ee8dcd2a14fdb9a7957d18d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Wed, 9 Oct 2024 14:40:27 +0200
Subject: [PATCH] fix: clarify the structure of the initial tarball
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The tarball located on S3 must contain both the SDG data and the model
to train.

Signed-off-by: Sébastien Han
---
 standalone/README.md      | 26 +++++++------
 standalone/standalone.py  | 78 +++++++++++++++++++++++++++------------
 standalone/standalone.tpl | 78 +++++++++++++++++++++++++++------------
 3 files changed, 125 insertions(+), 57 deletions(-)

diff --git a/standalone/README.md b/standalone/README.md
index 5e2ea9c..9a5cbb0 100644
--- a/standalone/README.md
+++ b/standalone/README.md
@@ -88,7 +88,7 @@ The script requires information regarding the location and method for accessing
* `--sdg-object-store-secret-key`: The secret key for the object store. `SDG_OBJECT_STORE_SECRET_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided.
* `--sdg-object-store-data-key`: The key for the SDG data in the object store. e.g.,
-  `sdg.tar.gz`.`SDG_OBJECT_STORE_DATA_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided.
+  `data.tar.gz`. `SDG_OBJECT_STORE_DATA_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided.
* `--sdg-object-store-verify-tls`: Whether to verify TLS for the object store endpoint (default: true). `SDG_OBJECT_STORE_VERIFY_TLS` environment variable can be used as well. **Optional**
* `--sdg-object-store-region`: The region of the object store. `SDG_OBJECT_STORE_REGION` environment
@@ -107,17 +107,21 @@ The script requires information regarding the location and method for accessing
The following example demonstrates how to generate SDG data, package it as a tarball, and upload
it to an object store. This assumes that AWS CLI is installed and configured with the necessary
credentials.
-In this scenario the name of the bucket is `sdg-data` and the tarball file is `sdg.tar.gz`.
+In this scenario the name of the bucket is `sdg-data`, the tarball file is `data.tar.gz`, and the model to train is available in a local `model` directory.

```bash
ilab data generate
-cd generated
-tar -czvf sdg.tar.gz *
-aws cp sdg.tar.gz s3://sdg-data/sdg.tar.gz
+mv generated data
+tar -czvf data.tar.gz data model
+aws s3 cp data.tar.gz s3://sdg-data/data.tar.gz
```

> [!CAUTION]
-> Ensures SDG data is packaged as a tarball **without** top-level directories. So you must run `tar` inside the directory containing the SDG data.
+> Ensure the SDG data is in a directory named `data` and the model is in a directory named `model`.
+> The tarball must contain two top-level directories: `data` and `model`.
+
+> [!CAUTION]
+> Make sure the archive is a gzip-compressed tarball (`.tar.gz`).

#### Alternative Method to AWS CLI

@@ -129,7 +133,7 @@ to upload the SDG data to the object store.
    --object-store-bucket sdg-data \
    --object-store-access-key $ACCESS_KEY \
    --object-store-secret-key $SECRET_KEY \
-    --sdg-data-archive-file-path sdg.tar.gz
+    --sdg-data-archive-file-path data.tar.gz
```

Run `./sdg-data-on-s3.py upload --help` to see all available options.

@@ -140,7 +144,7 @@ The simplest method to supply the script with the required information for retri
creating a Kubernetes secret. In the example below, we create a secret called `sdg-data` within the
`my-namespace` namespace, containing the necessary credentials.
Ensure that you update the access key and secret key as needed. The `data_key` field refers to the name of the tarball file in the
-object store that holds the SDG data. In this case, it's named `sdg.tar.gz`, as we previously
+object store that holds the SDG data. In this case, it's named `data.tar.gz`, as we previously
uploaded the tarball to the object store using this name.

```bash
@@ -155,7 +159,7 @@ stringData:
  bucket: sdg-data
  access_key: *****
  secret_key: *****
-  data_key: sdg.tar.gz
+  data_key: data.tar.gz
EOF

./standalone run \
@@ -203,7 +207,7 @@ Secret named `sdg-object-store-credentials` in the same namespace as the resourc
    --sdg-object-store-access-key key \
    --sdg-object-store-secret-key key \
    --sdg-object-store-bucket sdg-data \
-    --sdg-object-store-data-key sdg.tar.gz
+    --sdg-object-store-data-key data.tar.gz
```

#### Advanced Configuration Using an S3-Compatible Object Store

@@ -219,7 +223,7 @@ If you don't use the official AWS S3 endpoint, you can provide additional inform
    --sdg-object-store-access-key key \
    --sdg-object-store-secret-key key \
    --sdg-object-store-bucket sdg-data \
-    --sdg-object-store-data-key sdg.tar.gz \
+    --sdg-object-store-data-key data.tar.gz \
    --sdg-object-store-verify-tls false \
    --sdg-object-store-endpoint https://s3.openshift-storage.svc:443
```
diff --git a/standalone/standalone.py b/standalone/standalone.py
index f1c7484..d1c216f 100755
--- a/standalone/standalone.py
+++ b/standalone/standalone.py
@@ -257,7 +257,7 @@ def download_s3_file():
    bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET')
    s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY')

-    output_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz'
+    output_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz'

    s3.download_file(bucket_name, s3_key, output_file)

@@ -266,7 +266,7 @@ def upload_s3_file():
    bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET')
    s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY') # TODO: change the name for the model name

-    input_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz' # TODO: change for model path
+    input_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz' # TODO: change for model path

    s3.upload_file(input_file, bucket_name, s3_key)

@@ -283,9 +283,29 @@ def upload_s3_file():

python "$tmp"/download_s3.py

-if [[ "$STRATEGY" == "download" ]]; then
+if [ "$STRATEGY" == "download" ]; then
+    # List top-level directories only (no nested directories)
+    top_level_dirs=$(tar --exclude='*/*' --list --file {MODEL_PVC_MOUNT_PATH}/data.tar.gz)
+
+    # List of directories we expect in the archive
+    expected_dirs=("data" "model")
+
+    # Loop through the expected directories and check if they exist in the archive
+    for dir in "${{expected_dirs[@]}}"; do
+        if ! echo "$top_level_dirs" | grep -q "^$dir/$"; then
+            echo "Archive does not contain a '$dir' directory"
+            exit 1
+        fi
+    done
+    echo "All expected directories are present."
+
+    # First extract the SDG data into the SDG PVC
    mkdir -p {SDG_PVC_MOUNT_PATH}/generated
-    tar -xvf {SDG_PVC_MOUNT_PATH}/sdg.tar.gz -C {SDG_PVC_MOUNT_PATH}/generated
+    tar -C {SDG_PVC_MOUNT_PATH}/generated -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 data/
+
+    # Then extract the model into the model PVC
+    mkdir -p {MODEL_PVC_MOUNT_PATH}/model
+    tar -C {MODEL_PVC_MOUNT_PATH} -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 model/
fi
"""

@@ -568,9 +588,11 @@ def show(
    "--sdg-object-store-data-key",
    envvar="SDG_OBJECT_STORE_DATA_KEY",
    help=(
-        "Name of tarball that contains SDG data. (SDG_OBJECT_STORE_DATA_KEY env var)."
-        "The tarball MUST NOT contain a top-level directory. "
" - "To archive your SDG data, use the following command: cd /path/to/data && tar -czvf sdg.tar.gz *" + "Name of tarball that contains SDG data AND model files. (SDG_OBJECT_STORE_DATA_KEY env var)." + "The tarball MUST contain two directories: data and model." + "The data directory contains the SDG data." + "The model directory contains the model to train." + "To archive , use the following command: tar -czvf data.tar.gz /path/to/data /path/to/model ." ), type=str, ) @@ -736,6 +758,20 @@ def get_sdg_vol_mount() -> kubernetes.client.V1VolumeMount: ] +def get_fetch_sdg_vol_mount() -> kubernetes.client.V1VolumeMount: + """ + Get the volume mount for the SDG job. + """ + return [ + kubernetes.client.V1VolumeMount( + name=SDG_VOLUME_NAME, mount_path=SDG_PVC_MOUNT_PATH + ), + kubernetes.client.V1VolumeMount( + name=MODEL_VOLUME_NAME, mount_path=MODEL_PVC_MOUNT_PATH + ), + ] + + def create_sdg_job( namespace: str, job_name: str, @@ -1057,10 +1093,11 @@ def create_sdg_data_fetch_job( command=["/bin/sh", "-c"], args=[ SDG_DATA_SCRIPT.format( - strategy="download", SDG_PVC_MOUNT_PATH=SDG_PVC_MOUNT_PATH + strategy="download", + MODEL_PVC_MOUNT_PATH=MODEL_PVC_MOUNT_PATH, # TODO: DOWNLOAD ON THE MODEL PVC!! ) ], - volume_mounts=get_sdg_vol_mount(), + volume_mounts=get_fetch_sdg_vol_mount(), env=[ kubernetes.client.V1EnvVar( name="SDG_OBJECT_STORE_ENDPOINT", @@ -1110,6 +1147,14 @@ def create_sdg_data_fetch_job( ) ), ), + kubernetes.client.V1EnvVar( + name="SDG_OBJECT_STORE_MODEL_KEY", + value_from=kubernetes.client.V1EnvVarSource( + secret_key_ref=kubernetes.client.V1SecretKeySelector( + name=sdg_object_store_secret, key="model_key", optional=False + ) + ), + ), kubernetes.client.V1EnvVar( name="SDG_OBJECT_STORE_VERIFY_TLS", value_from=kubernetes.client.V1EnvVarSource( @@ -1134,12 +1179,6 @@ def create_sdg_data_fetch_job( claim_name=MODEL_PVC_NAME ), ), - kubernetes.client.V1Volume( - name=TRAINING_VOLUME_NAME, - persistent_volume_claim=kubernetes.client.V1PersistentVolumeClaimVolumeSource( - claim_name=TRAINING_PVC_NAME - ), - ), ] # Create and configure a spec section @@ -1814,15 +1853,8 @@ def decode_base64(data): "name": MODEL_PVC_NAME, "namespace": namespace, "storage_class": storage_class, - "access_modes": ["ReadWriteOnce"], - "size": "100Gi", # Model can be big so let's go with a safe size - }, - { - "name": TRAINING_PVC_NAME, - "namespace": namespace, - "storage_class": storage_class, "access_modes": ["ReadWriteMany"], - "size": "100Gi", # Training data can be big so let's go with a safe size + "size": "100Gi", # Model can be big so let's go with a safe size }, ] for pvc in pvcs: diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl index b0d5042..45e9712 100755 --- a/standalone/standalone.tpl +++ b/standalone/standalone.tpl @@ -242,7 +242,7 @@ def download_s3_file(): bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET') s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY') - output_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz' + output_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz' s3.download_file(bucket_name, s3_key, output_file) @@ -251,7 +251,7 @@ def upload_s3_file(): bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET') s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY') # TODO: change the name for the model name - input_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz' # TODO: change for model path + input_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz' # TODO: change for model path s3.upload_file(input_file, bucket_name, s3_key) @@ -268,9 +268,29 @@ EOF python "$tmp"/download_s3.py -if [[ "$STRATEGY" == 
"download" ]]; then +if [ "$STRATEGY" == "download" ]; then + # List top-level directories only (no nested directories) + top_level_dirs=$(tar --exclude='*/*' --list --file {MODEL_PVC_MOUNT_PATH}/data.tar.gz) + + # List of directories we expect in the archive + expected_dirs=("data" "model") + + # Loop through the expected directories and check if they exist in the archive + for dir in "${expected_dirs[@]}"; do + if ! echo "$top_level_dirs" | grep -q "^$dir/$"; then + echo "Archive does not contain a '$dir' directory" + exit 1 + fi + done + echo "All expected directories are present." + + # First extract SDG data in the SDG PVC mkdir -p {SDG_PVC_MOUNT_PATH}/generated - tar -xvf {SDG_PVC_MOUNT_PATH}/sdg.tar.gz -C {SDG_PVC_MOUNT_PATH}/generated + tar -C {SDG_PVC_MOUNT_PATH}/generated -xf data.tar.gz --strip-components=1 data/ + + # Then extract the model in the model PVC + mkdir -p {MODEL_PVC_MOUNT_PATH}/model + tar -C {MODEL_PVC_MOUNT_PATH} -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 model/ fi """ @@ -553,9 +573,11 @@ def show( "--sdg-object-store-data-key", envvar="SDG_OBJECT_STORE_DATA_KEY", help=( - "Name of tarball that contains SDG data. (SDG_OBJECT_STORE_DATA_KEY env var)." - "The tarball MUST NOT contain a top-level directory. " - "To archive your SDG data, use the following command: cd /path/to/data && tar -czvf sdg.tar.gz *" + "Name of tarball that contains SDG data AND model files. (SDG_OBJECT_STORE_DATA_KEY env var)." + "The tarball MUST contain two directories: data and model." + "The data directory contains the SDG data." + "The model directory contains the model to train." + "To archive , use the following command: tar -czvf data.tar.gz /path/to/data /path/to/model ." ), type=str, ) @@ -721,6 +743,20 @@ def get_sdg_vol_mount() -> kubernetes.client.V1VolumeMount: ] +def get_fetch_sdg_vol_mount() -> kubernetes.client.V1VolumeMount: + """ + Get the volume mount for the SDG job. + """ + return [ + kubernetes.client.V1VolumeMount( + name=SDG_VOLUME_NAME, mount_path=SDG_PVC_MOUNT_PATH + ), + kubernetes.client.V1VolumeMount( + name=MODEL_VOLUME_NAME, mount_path=MODEL_PVC_MOUNT_PATH + ), + ] + + def create_sdg_job( namespace: str, job_name: str, @@ -933,10 +969,11 @@ def create_sdg_data_fetch_job( command=["/bin/sh", "-c"], args=[ SDG_DATA_SCRIPT.format( - strategy="download", SDG_PVC_MOUNT_PATH=SDG_PVC_MOUNT_PATH + strategy="download", + MODEL_PVC_MOUNT_PATH=MODEL_PVC_MOUNT_PATH, # TODO: DOWNLOAD ON THE MODEL PVC!! 
) ], - volume_mounts=get_sdg_vol_mount(), + volume_mounts=get_fetch_sdg_vol_mount(), env=[ kubernetes.client.V1EnvVar( name="SDG_OBJECT_STORE_ENDPOINT", @@ -986,6 +1023,14 @@ def create_sdg_data_fetch_job( ) ), ), + kubernetes.client.V1EnvVar( + name="SDG_OBJECT_STORE_MODEL_KEY", + value_from=kubernetes.client.V1EnvVarSource( + secret_key_ref=kubernetes.client.V1SecretKeySelector( + name=sdg_object_store_secret, key="model_key", optional=False + ) + ), + ), kubernetes.client.V1EnvVar( name="SDG_OBJECT_STORE_VERIFY_TLS", value_from=kubernetes.client.V1EnvVarSource( @@ -1010,12 +1055,6 @@ def create_sdg_data_fetch_job( claim_name=MODEL_PVC_NAME ), ), - kubernetes.client.V1Volume( - name=TRAINING_VOLUME_NAME, - persistent_volume_claim=kubernetes.client.V1PersistentVolumeClaimVolumeSource( - claim_name=TRAINING_PVC_NAME - ), - ), ] # Create and configure a spec section @@ -1585,15 +1624,8 @@ def sdg_data_fetch( "name": MODEL_PVC_NAME, "namespace": namespace, "storage_class": storage_class, - "access_modes": ["ReadWriteOnce"], - "size": "100Gi", # Model can be big so let's go with a safe size - }, - { - "name": TRAINING_PVC_NAME, - "namespace": namespace, - "storage_class": storage_class, "access_modes": ["ReadWriteMany"], - "size": "100Gi", # Training data can be big so let's go with a safe size + "size": "100Gi", # Model can be big so let's go with a safe size }, ] for pvc in pvcs:
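
For quick reference, a minimal sketch of preparing and uploading an archive in the layout this patch expects. It assumes the `ilab data generate` output directory (`generated`) and a local `model` directory both sit in the current working directory, and it reuses the `sdg-data` bucket name from the README example.

```bash
# Rename the generated SDG output so the archive gets a top-level "data" directory,
# then bundle it together with the local "model" directory.
mv generated data
tar -czvf data.tar.gz data model

# Optional sanity check: list only the top-level entries, mirroring the check the
# data fetch job performs before extracting the archive (expects data/ and model/).
tar --exclude='*/*' --list --file data.tar.gz

# Upload the archive to the object store bucket.
aws s3 cp data.tar.gz s3://sdg-data/data.tar.gz
```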