fix: clarify the structure of the initial tarball
The tarball located on S3 must contain both the SDG data and the model to
train.

Signed-off-by: Sébastien Han <seb@redhat.com>
leseb committed Oct 9, 2024
1 parent a149ea9 commit 42bcfb4
Showing 3 changed files with 125 additions and 57 deletions.
26 changes: 15 additions & 11 deletions standalone/README.md
@@ -88,7 +88,7 @@ The script requires information regarding the location and method for accessing
* `--sdg-object-store-secret-key`: The secret key for the object store.
`SDG_OBJECT_STORE_SECRET_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided.
* `--sdg-object-store-data-key`: The key for the SDG data in the object store. e.g.,
`sdg.tar.gz`.`SDG_OBJECT_STORE_DATA_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided.
`data.tar.gz`.`SDG_OBJECT_STORE_DATA_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided.
* `--sdg-object-store-verify-tls`: Whether to verify TLS for the object store endpoint (default:
true). `SDG_OBJECT_STORE_VERIFY_TLS` environment variable can be used as well. **Optional**
* `--sdg-object-store-region`: The region of the object store. `SDG_OBJECT_STORE_REGION` environment
Expand All @@ -107,17 +107,21 @@ The script requires information regarding the location and method for accessing
The following example demonstrates how to generate SDG data, package it as a tarball, and upload it
to an object store. This assumes that AWS CLI is installed and configured with the necessary
credentials.
In this scenario the name of the bucket is `sdg-data` and the tarball file is `sdg.tar.gz`.
In this scenario the name of the bucket is `sdg-data` and the tarball file is `data.tar.gz`.

```bash
ilab data generate
cd generated
tar -czvf sdg.tar.gz *
aws cp sdg.tar.gz s3://sdg-data/sdg.tar.gz
mv generated data
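# NOTE: "model" is assumed to already exist next to "data" and to contain the model to train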
tar -czvf data.tar.gz data model
aws s3 cp data.tar.gz s3://sdg-data/data.tar.gz
```

> [!CAUTION]
> Ensures SDG data is packaged as a tarball **without** top-level directories. So you must run `tar` inside the directory containing the SDG data.
> Ensure the SDG data is in a directory called `data` and the model is in a directory called `model`.
> The tarball must contain two top-level directories: `data` and `model`.

> [!CAUTION]
> Make sure the tarball format is `.tar.gz`.
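
Before uploading, it can be worth checking the archive layout locally. The snippet below is a rough sketch (assuming GNU tar and the `data.tar.gz` file created above) that lists only the top-level entries and mirrors the check the data-fetch job performs on the cluster:

```bash
# List only the archive's top-level entries (directories end with a trailing slash)
top_level_dirs=$(tar --exclude='*/*' --list --file data.tar.gz)

for dir in data model; do
  if ! echo "$top_level_dirs" | grep -q "^$dir/$"; then
    echo "data.tar.gz does not contain a top-level '$dir/' directory" >&2
    exit 1
  fi
done
echo "data.tar.gz contains the expected 'data/' and 'model/' directories."
```
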
#### Alternative Method to AWS CLI

Expand All @@ -129,7 +133,7 @@ to upload the SDG data to the object store.
--object-store-bucket sdg-data \
--object-store-access-key $ACCESS_KEY \
--object-store-secret-key $SECRET_KEY \
--sdg-data-archive-file-path sdg.tar.gz
--sdg-data-archive-file-path data.tar.gz
```

Run `./sdg-data-on-s3.py upload --help` to see all available options.
Expand All @@ -140,7 +144,7 @@ The simplest method to supply the script with the required information for retri
creating a Kubernetes secret. In the example below, we create a secret called `sdg-data` within the
`my-namespace` namespace, containing the necessary credentials. Ensure that you update the access
key and secret key as needed. The `data_key` field refers to the name of the tarball file in the
object store that holds the SDG data. In this case, it's named `sdg.tar.gz`, as we previously
object store that holds the SDG data. In this case, it's named `data.tar.gz`, as we previously
uploaded the tarball to the object store using this name.

```bash
Expand All @@ -155,7 +159,7 @@ stringData:
bucket: sdg-data
access_key: *****
secret_key: *****
data_key: sdg.tar.gz
data_key: data.tar.gz
EOF

./standalone run \
@@ -203,7 +207,7 @@ Secret named `sdg-object-store-credentials` in the same namespace as the resourc
--sdg-object-store-access-key key \
--sdg-object-store-secret-key key \
--sdg-object-store-bucket sdg-data \
--sdg-object-store-data-key sdg.tar.gz
--sdg-object-store-data-key data.tar.gz
```
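
A note on the Secret-based flow shown earlier: the data-fetch job defined in `standalone.py` below also reads a non-optional `model_key` entry from the object-store Secret (see the `SDG_OBJECT_STORE_MODEL_KEY` environment variable), which the earlier `sdg-data` Secret example does not include. A sketch of the Secret extended accordingly; the `model.tar.gz` object name here is only an assumption:

```bash
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Secret
metadata:
  name: sdg-data
  namespace: my-namespace
type: Opaque
stringData:
  bucket: sdg-data
  access_key: *****
  secret_key: *****
  data_key: data.tar.gz
  model_key: model.tar.gz  # assumed object name; adjust to the model archive you uploaded
EOF
```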

#### Advanced Configuration Using an S3-Compatible Object Store
Expand All @@ -219,7 +223,7 @@ If you don't use the official AWS S3 endpoint, you can provide additional inform
--sdg-object-store-access-key key \
--sdg-object-store-secret-key key \
--sdg-object-store-bucket sdg-data \
--sdg-object-store-data-key sdg.tar.gz \
--sdg-object-store-data-key data.tar.gz \
--sdg-object-store-verify-tls false \
--sdg-object-store-endpoint https://s3.openshift-storage.svc:443
```
78 changes: 55 additions & 23 deletions standalone/standalone.py
@@ -257,7 +257,7 @@ def download_s3_file():
bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET')
s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY')
output_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz'
output_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz'
s3.download_file(bucket_name, s3_key, output_file)
@@ -266,7 +266,7 @@ def upload_s3_file():
bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET')
s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY') # TODO: change the name for the model name
input_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz' # TODO: change for model path
input_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz' # TODO: change for model path
s3.upload_file(input_file, bucket_name, s3_key)
@@ -283,9 +283,29 @@ def upload_s3_file():
python "$tmp"/download_s3.py
if [[ "$STRATEGY" == "download" ]]; then
if [ "$STRATEGY" == "download" ]; then
# List top-level directories only (no nested directories)
top_level_dirs=$(tar --exclude='*/*' --list --file {MODEL_PVC_MOUNT_PATH}/data.tar.gz)
# List of directories we expect in the archive
expected_dirs=("data" "model")
# Loop through the expected directories and check if they exist in the archive
for dir in "${expected_dirs[@]}"; do
if ! echo "$top_level_dirs" | grep -q "^$dir/$"; then
echo "Archive does not contain a '$dir' directory"
exit 1
fi
done
echo "All expected directories are present."
# First extract SDG data in the SDG PVC
mkdir -p {SDG_PVC_MOUNT_PATH}/generated
tar -xvf {SDG_PVC_MOUNT_PATH}/sdg.tar.gz -C {SDG_PVC_MOUNT_PATH}/generated
tar -C {SDG_PVC_MOUNT_PATH}/generated -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 data/
# Then extract the model in the model PVC
mkdir -p {MODEL_PVC_MOUNT_PATH}/model
tar -C {MODEL_PVC_MOUNT_PATH} -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 model/
fi
"""

@@ -568,9 +588,11 @@ def show(
"--sdg-object-store-data-key",
envvar="SDG_OBJECT_STORE_DATA_KEY",
help=(
"Name of tarball that contains SDG data. (SDG_OBJECT_STORE_DATA_KEY env var)."
"The tarball MUST NOT contain a top-level directory. "
"To archive your SDG data, use the following command: cd /path/to/data && tar -czvf sdg.tar.gz *"
"Name of tarball that contains SDG data AND model files. (SDG_OBJECT_STORE_DATA_KEY env var)."
"The tarball MUST contain two directories: data and model."
"The data directory contains the SDG data."
"The model directory contains the model to train."
"To archive , use the following command: tar -czvf data.tar.gz /path/to/data /path/to/model ."
),
type=str,
)
@@ -736,6 +758,20 @@ def get_sdg_vol_mount() -> kubernetes.client.V1VolumeMount:
]


def get_fetch_sdg_vol_mount() -> kubernetes.client.V1VolumeMount:
"""
Get the volume mounts for the SDG data fetch job.
"""
return [
kubernetes.client.V1VolumeMount(
name=SDG_VOLUME_NAME, mount_path=SDG_PVC_MOUNT_PATH
),
kubernetes.client.V1VolumeMount(
name=MODEL_VOLUME_NAME, mount_path=MODEL_PVC_MOUNT_PATH
),
]


def create_sdg_job(
namespace: str,
job_name: str,
@@ -1057,10 +1093,11 @@ def create_sdg_data_fetch_job(
command=["/bin/sh", "-c"],
args=[
SDG_DATA_SCRIPT.format(
strategy="download", SDG_PVC_MOUNT_PATH=SDG_PVC_MOUNT_PATH
strategy="download",
MODEL_PVC_MOUNT_PATH=MODEL_PVC_MOUNT_PATH, # TODO: DOWNLOAD ON THE MODEL PVC!!
)
],
volume_mounts=get_sdg_vol_mount(),
volume_mounts=get_fetch_sdg_vol_mount(),
env=[
kubernetes.client.V1EnvVar(
name="SDG_OBJECT_STORE_ENDPOINT",
@@ -1110,6 +1147,14 @@ def create_sdg_data_fetch_job(
)
),
),
kubernetes.client.V1EnvVar(
name="SDG_OBJECT_STORE_MODEL_KEY",
value_from=kubernetes.client.V1EnvVarSource(
secret_key_ref=kubernetes.client.V1SecretKeySelector(
name=sdg_object_store_secret, key="model_key", optional=False
)
),
),
kubernetes.client.V1EnvVar(
name="SDG_OBJECT_STORE_VERIFY_TLS",
value_from=kubernetes.client.V1EnvVarSource(
Expand All @@ -1134,12 +1179,6 @@ def create_sdg_data_fetch_job(
claim_name=MODEL_PVC_NAME
),
),
kubernetes.client.V1Volume(
name=TRAINING_VOLUME_NAME,
persistent_volume_claim=kubernetes.client.V1PersistentVolumeClaimVolumeSource(
claim_name=TRAINING_PVC_NAME
),
),
]

# Create and configure a spec section
@@ -1814,15 +1853,8 @@ def decode_base64(data):
"name": MODEL_PVC_NAME,
"namespace": namespace,
"storage_class": storage_class,
"access_modes": ["ReadWriteOnce"],
"size": "100Gi", # Model can be big so let's go with a safe size
},
{
"name": TRAINING_PVC_NAME,
"namespace": namespace,
"storage_class": storage_class,
"access_modes": ["ReadWriteMany"],
"size": "100Gi", # Training data can be big so let's go with a safe size
"size": "100Gi", # Model can be big so let's go with a safe size
},
]
for pvc in pvcs:
78 changes: 55 additions & 23 deletions standalone/standalone.tpl
@@ -242,7 +242,7 @@ def download_s3_file():
bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET')
s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY')
output_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz'
output_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz'
s3.download_file(bucket_name, s3_key, output_file)
@@ -251,7 +251,7 @@ def upload_s3_file():
bucket_name = os.getenv('SDG_OBJECT_STORE_BUCKET')
s3_key = os.getenv('SDG_OBJECT_STORE_DATA_KEY') # TODO: change the name for the model name
input_file = '{SDG_PVC_MOUNT_PATH}/sdg.tar.gz' # TODO: change for model path
input_file = '{MODEL_PVC_MOUNT_PATH}/data.tar.gz' # TODO: change for model path
s3.upload_file(input_file, bucket_name, s3_key)
@@ -268,9 +268,29 @@ EOF
python "$tmp"/download_s3.py
if [[ "$STRATEGY" == "download" ]]; then
if [ "$STRATEGY" == "download" ]; then
# List top-level directories only (no nested directories)
top_level_dirs=$(tar --exclude='*/*' --list --file {MODEL_PVC_MOUNT_PATH}/data.tar.gz)
# List of directories we expect in the archive
expected_dirs=("data" "model")
# Loop through the expected directories and check if they exist in the archive
for dir in "${expected_dirs[@]}"; do
if ! echo "$top_level_dirs" | grep -q "^$dir/$"; then
echo "Archive does not contain a '$dir' directory"
exit 1
fi
done
echo "All expected directories are present."
# First extract SDG data in the SDG PVC
mkdir -p {SDG_PVC_MOUNT_PATH}/generated
tar -xvf {SDG_PVC_MOUNT_PATH}/sdg.tar.gz -C {SDG_PVC_MOUNT_PATH}/generated
tar -C {SDG_PVC_MOUNT_PATH}/generated -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 data/
# Then extract the model in the model PVC
mkdir -p {MODEL_PVC_MOUNT_PATH}/model
tar -C {MODEL_PVC_MOUNT_PATH} -xf {MODEL_PVC_MOUNT_PATH}/data.tar.gz --strip-components=1 model/
fi
"""

@@ -553,9 +573,11 @@ def show(
"--sdg-object-store-data-key",
envvar="SDG_OBJECT_STORE_DATA_KEY",
help=(
"Name of tarball that contains SDG data. (SDG_OBJECT_STORE_DATA_KEY env var)."
"The tarball MUST NOT contain a top-level directory. "
"To archive your SDG data, use the following command: cd /path/to/data && tar -czvf sdg.tar.gz *"
"Name of tarball that contains SDG data AND model files. (SDG_OBJECT_STORE_DATA_KEY env var)."
"The tarball MUST contain two directories: data and model."
"The data directory contains the SDG data."
"The model directory contains the model to train."
"To archive , use the following command: tar -czvf data.tar.gz /path/to/data /path/to/model ."
),
type=str,
)
@@ -721,6 +743,20 @@ def get_sdg_vol_mount() -> kubernetes.client.V1VolumeMount:
]


def get_fetch_sdg_vol_mount() -> kubernetes.client.V1VolumeMount:
"""
Get the volume mounts for the SDG data fetch job.
"""
return [
kubernetes.client.V1VolumeMount(
name=SDG_VOLUME_NAME, mount_path=SDG_PVC_MOUNT_PATH
),
kubernetes.client.V1VolumeMount(
name=MODEL_VOLUME_NAME, mount_path=MODEL_PVC_MOUNT_PATH
),
]


def create_sdg_job(
namespace: str,
job_name: str,
@@ -933,10 +969,11 @@ def create_sdg_data_fetch_job(
command=["/bin/sh", "-c"],
args=[
SDG_DATA_SCRIPT.format(
strategy="download", SDG_PVC_MOUNT_PATH=SDG_PVC_MOUNT_PATH
strategy="download",
MODEL_PVC_MOUNT_PATH=MODEL_PVC_MOUNT_PATH, # TODO: DOWNLOAD ON THE MODEL PVC!!
)
],
volume_mounts=get_sdg_vol_mount(),
volume_mounts=get_fetch_sdg_vol_mount(),
env=[
kubernetes.client.V1EnvVar(
name="SDG_OBJECT_STORE_ENDPOINT",
@@ -986,6 +1023,14 @@ def create_sdg_data_fetch_job(
)
),
),
kubernetes.client.V1EnvVar(
name="SDG_OBJECT_STORE_MODEL_KEY",
value_from=kubernetes.client.V1EnvVarSource(
secret_key_ref=kubernetes.client.V1SecretKeySelector(
name=sdg_object_store_secret, key="model_key", optional=False
)
),
),
kubernetes.client.V1EnvVar(
name="SDG_OBJECT_STORE_VERIFY_TLS",
value_from=kubernetes.client.V1EnvVarSource(
Expand All @@ -1010,12 +1055,6 @@ def create_sdg_data_fetch_job(
claim_name=MODEL_PVC_NAME
),
),
kubernetes.client.V1Volume(
name=TRAINING_VOLUME_NAME,
persistent_volume_claim=kubernetes.client.V1PersistentVolumeClaimVolumeSource(
claim_name=TRAINING_PVC_NAME
),
),
]

# Create and configure a spec section
@@ -1585,15 +1624,8 @@ def sdg_data_fetch(
"name": MODEL_PVC_NAME,
"namespace": namespace,
"storage_class": storage_class,
"access_modes": ["ReadWriteOnce"],
"size": "100Gi", # Model can be big so let's go with a safe size
},
{
"name": TRAINING_PVC_NAME,
"namespace": namespace,
"storage_class": storage_class,
"access_modes": ["ReadWriteMany"],
"size": "100Gi", # Training data can be big so let's go with a safe size
"size": "100Gi", # Model can be big so let's go with a safe size
},
]
for pvc in pvcs:
