From d012b037ecacac4401e49fac7d89e5c66480a106 Mon Sep 17 00:00:00 2001
From: skordas
Date: Wed, 16 Oct 2019 12:33:18 -0400
Subject: [PATCH 1/4] Adding Concurrent Jobs with ConfigMaps workload

---
 .../concurrent-jobs-with-configmaps.yaml      | 131 ++++++++++++++++++
 ...urrent-jobs-with-configmaps-script-cm.yaml |  68 +++++++++
 .../vars/concurrent-jobs-with-configmaps.yaml |  29 ++++
 3 files changed, 228 insertions(+)
 create mode 100644 workloads/concurrent-jobs-with-configmaps.yaml
 create mode 100644 workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
 create mode 100644 workloads/vars/concurrent-jobs-with-configmaps.yaml

diff --git a/workloads/concurrent-jobs-with-configmaps.yaml b/workloads/concurrent-jobs-with-configmaps.yaml
new file mode 100644
index 00000000..c4e0857d
--- /dev/null
+++ b/workloads/concurrent-jobs-with-configmaps.yaml
@@ -0,0 +1,131 @@
+---
+#
+# Runs the concurrent jobs with configmaps benchmark on an existing cluster.
+#
+
+- name: Runs concurrent jobs with configmaps
+  hosts: orchestration
+  gather_facts: true
+  remote_user: "{{orchestration_user}}"
+  vars_files:
+    - vars/concurrent-jobs-with-configmaps.yaml
+  vars:
+    workload_job: "concurrent-jobs"
+  tasks:
+    - name: Create scale-ci-tooling directory
+      file:
+        path: "{{ansible_user_dir}}/scale-ci-tooling"
+        state: directory
+
+    - name: Copy workload files
+      copy:
+        src: "{{item.src}}"
+        dest: "{{item.dest}}"
+      with_items:
+        - src: scale-ci-tooling-ns.yml
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml"
+        - src: workload-network-script-cm.yml
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-concurrent-jobs-with-configmaps-script-cm.yml"
+
+    - name: Slurp kubeconfig file
+      slurp:
+        src: "{{kubeconfig_file}}"
+      register: kubeconfig_file_slurp
+
+    - name: Slurp ssh private key file
+      slurp:
+        src: "{{pbench_ssh_private_key_file}}"
+      register: pbench_ssh_private_key_file_slurp
+
+    - name: Slurp ssh public key file
+      slurp:
+        src: "{{pbench_ssh_public_key_file}}"
+      register: pbench_ssh_public_key_file_slurp
+
+    - name: Template workload templates
+      template:
+        src: "{{item.src}}"
+        dest: "{{item.dest}}"
+      with_items:
+        - src: pbench-cm.yml.j2
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/pbench-cm.yml"
+        - src: pbench-ssh-secret.yml.j2
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/pbench-ssh-secret.yml"
+        - src: kubeconfig-secret.yml.j2
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/kubeconfig-secret.yml"
+        - src: workload-job.yml.j2
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml"
+        - src: workload-env.yml.j2
+          dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-{{workload_job}}-env.yml"
+
+    - name: Check if scale-ci-tooling namespace exists
+      shell: |
+        oc project scale-ci-tooling
+      ignore_errors: true
+      changed_when: false
+      register: scale_ci_tooling_ns_exists
+
+    - name: Ensure any stale scale-ci-concurrent-jobs job is deleted
+      shell: |
+        oc delete job scale-ci-{{workload_job}} -n scale-ci-tooling
+      register: scale_ci_tooling_project
+      failed_when: scale_ci_tooling_project.rc == 0
+      until: scale_ci_tooling_project.rc == 1
+      retries: 60
+      delay: 1
+      when: scale_ci_tooling_ns_exists.rc == 0
+
+    - name: Block for non-existing tooling namespace
+      block:
+        - name: Create tooling namespace
+          shell: |
+            oc create -f {{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml
+
+        - name: Create tooling service account
+          shell: |
+            oc create serviceaccount useroot -n scale-ci-tooling
+            oc adm policy add-scc-to-user privileged -z useroot -n scale-ci-tooling
+          when: enable_pbench_agents|bool or workload_job_privileged|bool
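+          # the privileged SCC on the useroot service account is only needed
+          # when pbench agents or a privileged workload job are requested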
+      when: scale_ci_tooling_ns_exists.rc != 0
+
+    - name: Create/replace kubeconfig secret
+      shell: |
+        oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/kubeconfig-secret.yml"
+
+    - name: Create/replace the pbench configmap
+      shell: |
+        oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/pbench-cm.yml"
+
+    - name: Create/replace pbench ssh secret
+      shell: |
+        oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/pbench-ssh-secret.yml"
+
+    - name: Create/replace workload script configmap
+      shell: |
+        oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/workload-concurrent-jobs-with-configmaps-script-cm.yml"
+
+    - name: Create/replace workload script environment configmap
+      shell: |
+        oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/workload-{{workload_job}}-env.yml"
+
+    - name: Create/replace workload job that runs workload script
+      shell: |
+        oc replace --force -n scale-ci-tooling -f "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml"
+
+    - name: Poll until job pod is running
+      shell: |
+        oc get pods --selector=job-name=scale-ci-{{workload_job}} -n scale-ci-tooling -o json
+      register: pod_json
+      retries: 60
+      delay: 2
+      until: pod_json.stdout | from_json | json_query('items[0].status.phase==`Running`')
+
+    - name: Poll until job is complete
+      shell: |
+        oc get job scale-ci-{{workload_job}} -n scale-ci-tooling -o json
+      register: job_json
+      retries: "{{job_completion_poll_attempts}}"
+      delay: 10
+      until: job_json.stdout | from_json | json_query('status.succeeded==`1` || status.failed==`1`')
+      failed_when: job_json.stdout | from_json | json_query('status.succeeded==`1`') == false
+      when: job_completion_poll_attempts|int > 0
\ No newline at end of file
diff --git a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
new file mode 100644
index 00000000..d9cc20af
--- /dev/null
+++ b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
@@ -0,0 +1,68 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: scale-ci-workload-script
+data:
+  run.sh: |
+    #!/bin/bash
+    jobs_amount=${NUMBER_OF_CONCURRENT_JOBS}
+    function create_jobs()
+    {
+      for i in $(seq 1 $jobs_amount);
+      do
+        cat /tmp/conc_jobs.yaml | sed "s/%JOB_ID%/$i/g" | oc create -f -
+      done
+    }
+    function wait_for_completion()
+    {
+      running=`oc get pods | grep -c Completed`
+      while [ $running -lt $jobs_amount ]; do
+        sleep 1
+        running=`oc get pods | grep -E "Completed|OOMKilled" | wc -l`
+        echo "$running jobs are completed"
+      done
+    }
+    start_time=`date +%s`
+    create_jobs
+    wait_for_completion
+    end_time=`date +%s`
+    total_time=`echo $end_time - $start_time | bc`
+    echo "OOMKILLED jobs:"
+    oc get pods | grep OOMKilled
+    echo "Time taken for creating $jobs_amount concurrent jobs with configmaps $total_time"
+  conc_jobs.yaml: |
+    # Example from: https://github.com/kubernetes/kubernetes/issues/74412#issue-413387234
+    ---
+    apiVersion: v1
+    kind: ConfigMap
+    metadata:
+      name: job-%JOB_ID%
+    data:
+      game.properties: |
+        enemies=aliens
+    ---
+    apiVersion: batch/v1
+    kind: Job
+    metadata:
+      name: job-%JOB_ID%
+    spec:
+      template:
+        spec:
+          containers:
+          - name: busybox
+            image: busybox
+            resources:
+              requests:
+                memory: "50Mi"
+                cpu: "10m"
+            command: [ "/bin/echo" ]
+            args: [ "Hello, World!" ]
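+            # the volume mount below is what ties each Job to its matching
+            # job-%JOB_ID% ConfigMap rendered by the sed loop in run.sh above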
+            volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+          volumes:
+          - name: config-volume
+            configMap:
+              name: job-%JOB_ID%
+          restartPolicy: Never
+      backoffLimit: 4
\ No newline at end of file
diff --git a/workloads/vars/concurrent-jobs-with-configmaps.yaml b/workloads/vars/concurrent-jobs-with-configmaps.yaml
new file mode 100644
index 00000000..9c538397
--- /dev/null
+++ b/workloads/vars/concurrent-jobs-with-configmaps.yaml
@@ -0,0 +1,29 @@
+---
+###############################################################################
+# Ansible SSH variables.
+###############################################################################
+ansible_public_key_file: "{{ lookup('env', 'PUBLIC_KEY')|default('~/.ssh/id_rsa.pub', true) }}"
+ansible_private_key_file: "{{ lookup('env', 'PRIVATE_KEY')|default('~/.ssh/id_rsa', true) }}"
+
+orchestration_user: "{{ lookup('env', 'ORCHESTRATION_USER')|default('root', true) }}"
+###############################################################################
+# Conformance workload variables.
+###############################################################################
+workload_image: "{{ lookup('env', 'WORKLOAD_IMAGE')|default('quay.io/openshift-scale/scale-ci-workload', true) }}"
+
+workload_job_node_selector: "{{ lookup('env', 'WORKLOAD_JOB_NODE_SELECTOR')|default(false, true)|bool }}"
+workload_job_taint: "{{ lookup('env', 'WORKLOAD_JOB_TAINT')|default(false, true)|bool }}"
+workload_job_privileged: "{{ lookup('env', 'WORKLOAD_JOB_PRIVILEGED')|default(false, true)|bool }}"
+
+kubeconfig_file: "{{ lookup('env', 'KUBECONFIG_FILE')|default('~/.kube/config', true) }}"
+
+# pbench variables
+pbench_instrumentation: "{{ lookup('env', 'PBENCH_INSTRUMENTATION')|default(false, true)|bool|lower }}"
+enable_pbench_agents: "{{ lookup('env', 'ENABLE_PBENCH_AGENTS')|default(false, true)|bool }}"
+enable_pbench_copy: "{{ lookup('env', 'ENABLE_PBENCH_COPY')|default(false, true)|bool|lower }}"
+pbench_ssh_private_key_file: "{{ lookup('env', 'PBENCH_SSH_PRIVATE_KEY_FILE')|default('~/.ssh/id_rsa', true) }}"
+pbench_ssh_public_key_file: "{{ lookup('env', 'PBENCH_SSH_PUBLIC_KEY_FILE')|default('~/.ssh/id_rsa.pub', true) }}"
+pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
+
+# Other variables for workload tests
+number_of_concurrent_jobs: "{{ lookup('env', 'NUMBER_OF_CONCURRENT_JOBS')|default(300, true)|int }}"
\ No newline at end of file

From 936dd2990c847b54942f43804d453147bfab1e97 Mon Sep 17 00:00:00 2001
From: skordas
Date: Thu, 17 Oct 2019 08:45:02 -0400
Subject: [PATCH 2/4] correct name of yaml file

---
 workloads/concurrent-jobs-with-configmaps.yaml               | 4 ++--
 .../workload-concurrent-jobs-with-configmaps-script-cm.yaml  | 2 +-
 workloads/vars/concurrent-jobs-with-configmaps.yaml          | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workloads/concurrent-jobs-with-configmaps.yaml b/workloads/concurrent-jobs-with-configmaps.yaml
index c4e0857d..ad454931 100644
--- a/workloads/concurrent-jobs-with-configmaps.yaml
+++ b/workloads/concurrent-jobs-with-configmaps.yaml
@@ -24,7 +24,7 @@
       with_items:
         - src: scale-ci-tooling-ns.yml
          dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml"
-        - src: workload-network-script-cm.yml
+        - src: workload-concurrent-jobs-with-configmaps-script-cm.yaml
           dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-concurrent-jobs-with-configmaps-script-cm.yml"
 
     - name: Slurp kubeconfig file
@@ -128,4 +128,4 @@
       delay: 10
       until: job_json.stdout | from_json | json_query('status.succeeded==`1` || status.failed==`1`')
       failed_when: job_json.stdout | from_json | json_query('status.succeeded==`1`') == false
-      when: job_completion_poll_attempts|int > 0
\ No newline at end of file
+      when: job_completion_poll_attempts|int > 0
diff --git a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
index d9cc20af..a938c9a2 100644
--- a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
+++ b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
@@ -65,4 +65,4 @@ data:
             configMap:
               name: job-%JOB_ID%
           restartPolicy: Never
-      backoffLimit: 4
\ No newline at end of file
+      backoffLimit: 4
diff --git a/workloads/vars/concurrent-jobs-with-configmaps.yaml b/workloads/vars/concurrent-jobs-with-configmaps.yaml
index 9c538397..6a3821a0 100644
--- a/workloads/vars/concurrent-jobs-with-configmaps.yaml
+++ b/workloads/vars/concurrent-jobs-with-configmaps.yaml
@@ -26,4 +26,4 @@ pbench_ssh_public_key_file: "{{ lookup('env', 'PBENCH_SSH_PUBLIC_KEY_FILE')|defa
 pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
 
 # Other variables for workload tests
-number_of_concurrent_jobs: "{{ lookup('env', 'NUMBER_OF_CONCURRENT_JOBS')|default(300, true)|int }}"
\ No newline at end of file
+number_of_concurrent_jobs: "{{ lookup('env', 'NUMBER_OF_CONCURRENT_JOBS')|default(300, true)|int }}"

From 66546286b8e3edf2157af91b61a270d5eca984af Mon Sep 17 00:00:00 2001
From: skordas
Date: Mon, 21 Oct 2019 14:24:38 -0400
Subject: [PATCH 3/4] Adding documentation. Small corrections and enhancements to run the workload smoothly.

---
 docs/README.md                                |  70 +++++++++--------
 docs/concurent-jobs-with-configmaps.md        |  77 +++++++++++++++++++
 ...ml => concurrent-jobs-with-configmaps.yml} |  13 +++-
 ...urrent-jobs-with-configmaps-script-cm.yml} |  10 ++-
 workloads/templates/workload-env.yml.j2       |   2 +
 ...ml => concurrent-jobs-with-configmaps.yml} |   3 +-
 6 files changed, 135 insertions(+), 40 deletions(-)
 create mode 100644 docs/concurent-jobs-with-configmaps.md
 rename workloads/{concurrent-jobs-with-configmaps.yaml => concurrent-jobs-with-configmaps.yml} (92%)
 rename workloads/files/{workload-concurrent-jobs-with-configmaps-script-cm.yaml => workload-concurrent-jobs-with-configmaps-script-cm.yml} (82%)
 rename workloads/vars/{concurrent-jobs-with-configmaps.yaml => concurrent-jobs-with-configmaps.yml} (93%)

diff --git a/docs/README.md b/docs/README.md
index bb325d55..6f7524c8 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,22 +1,23 @@
 # Table of workloads
-| Workload/tooling                                    | Short Description                          | Minimum Requirements                  |
-|:-------------------------------------------------- |:----------------------------------------- | ------------------------------------- |
-| [Tooling](tooling.md)                               | Setup pbench instrumentation tools        | Cluster-admin, Privileged Containers  |
-| [Test](test.md)                                     | Test/Run your workload from ssh Container | Cluster-admin, Privileged Containers  |
-| [Baseline](baseline.md)                             | Baseline metrics capture                  | Tooling job*                          |
-| [Scale](scale.md)                                   | Scales worker nodes                       | Cluster-admin                         |
-| [NodeVertical](nodevertical.md)                     | Node Kubelet Density                      | Labeling Nodes                        |
-| [PodVertical](podvertical.md)                       | Max Pod Density                           | None                                  |
-| [MasterVertical](mastervertical.md)                 | Master Node Stress workload               | None                                  |
-| [HTTP](http.md)                                     | HTTP ingress TPS/Latency                  | None                                  |
-| [Network](network.md)                               | TCP/UDP Throughput/Latency                | Labeling Nodes, [See below](#network) |
-| [Deployments Per Namespace](deployments-per-ns.md) | Maximum Deployments                        | None                                  |
-| [PVCscale](pvscale.md)                              | PVCScale test                             | Working storageclass                  |
-| [Conformance](conformance.md)                       | OCP/Kubernetes e2e tests                  | None                                  |
-| [Namespaces per cluster](namespaces-per-cluster.md) | Maximum Namespaces                        | None                                  |
-| [Services per namespace](services-per-namespace.md) | Maximum services per namespace            | None                                  |
-| [FIO I/O test](fio.md)                              | FIO I/O test - stress storage backend     | Privileged Containers, Working storage class |
+| Workload/tooling                                                     | Short Description                          | Minimum Requirements                         |
+|:------------------------------------------------------------------- |:----------------------------------------- | -------------------------------------------- |
+| [Tooling](tooling.md)                                                | Setup pbench instrumentation tools        | Cluster-admin, Privileged Containers         |
+| [Test](test.md)                                                      | Test/Run your workload from ssh Container | Cluster-admin, Privileged Containers         |
+| [Baseline](baseline.md)                                              | Baseline metrics capture                  | Tooling job*                                 |
+| [Scale](scale.md)                                                    | Scales worker nodes                       | Cluster-admin                                |
+| [NodeVertical](nodevertical.md)                                      | Node Kubelet Density                      | Labeling Nodes                               |
+| [PodVertical](podvertical.md)                                        | Max Pod Density                           | None                                         |
+| [MasterVertical](mastervertical.md)                                  | Master Node Stress workload               | None                                         |
+| [HTTP](http.md)                                                      | HTTP ingress TPS/Latency                  | None                                         |
+| [Network](network.md)                                                | TCP/UDP Throughput/Latency                | Labeling Nodes, [See below](#network)        |
+| [Deployments Per Namespace](deployments-per-ns.md)                   | Maximum Deployments                       | None                                         |
+| [PVCscale](pvscale.md)                                               | PVCScale test                             | Working storageclass                         |
+| [Conformance](conformance.md)                                        | OCP/Kubernetes e2e tests                  | None                                         |
+| [Namespaces per cluster](namespaces-per-cluster.md)                  | Maximum Namespaces                        | None                                         |
+| [Services per namespace](services-per-namespace.md)                  | Maximum services per namespace            | None                                         |
+| [FIO I/O test](fio.md)                                               | FIO I/O test - stress storage backend     | Privileged Containers, Working storage class |
+| [Concurrent jobs with configmaps](concurent-jobs-with-configmaps.md) | Create and run simple jobs                | None                                         |
 
 * Baseline job without a tooled cluster just idles a cluster. The goal is to capture resource consumption over a period of time to characterize resource requirements, thus tooling is required. (For now)
 
@@ -36,20 +37,21 @@
 Each workload will implement a form of pass/fail criteria in order to flag if the tests have failed in CI.
 
-| Workload/tooling                                    | Pass/Fail                     |
-|:-------------------------------------------------- |:----------------------------- |
-| [Tooling](tooling.md)                               | NA                            |
-| [Test](test.md)                                     | NA                            |
-| [Baseline](baseline.md)                             | NA                            |
-| [Scale](scale.md)                                   | Yes: Test Duration            |
-| [NodeVertical](nodevertical.md)                     | Yes: Exit Code, Test Duration |
-| [PodVertical](podvertical.md)                       | Yes: Exit Code, Test Duration |
-| [MasterVertical](mastervertical.md)                 | Yes: Exit Code, Test Duration |
-| [HTTP](http.md)                                     | No                            |
-| [Network](network.md)                               | No                            |
-| [Deployments Per Namespace](deployments-per-ns.md)  | No                            |
-| [PVCscale](pvscale.md)                              | No                            |
-| [Conformance](conformance.md)                       | No                            |
-| [Namespaces per cluster](namespaces-per-cluster.md) | Yes: Exit code, Test Duration |
-| [Services per namespace](services-per-namespace.md) | Yes: Exit code, Test Duration |
-| [FIO I/O test](fio.md)                              | No                            |
+| Workload/tooling                                                     | Pass/Fail                     |
+|:------------------------------------------------------------------- |:----------------------------- |
+| [Tooling](tooling.md)                                                | NA                            |
+| [Test](test.md)                                                      | NA                            |
+| [Baseline](baseline.md)                                              | NA                            |
+| [Scale](scale.md)                                                    | Yes: Test Duration            |
+| [NodeVertical](nodevertical.md)                                      | Yes: Exit Code, Test Duration |
+| [PodVertical](podvertical.md)                                        | Yes: Exit Code, Test Duration |
+| [MasterVertical](mastervertical.md)                                  | Yes: Exit Code, Test Duration |
+| [HTTP](http.md)                                                      | No                            |
+| [Network](network.md)                                                | No                            |
+| [Deployments Per Namespace](deployments-per-ns.md)                   | No                            |
+| [PVCscale](pvscale.md)                                               | No                            |
+| [Conformance](conformance.md)                                        | No                            |
+| [Namespaces per cluster](namespaces-per-cluster.md)                  | Yes: Exit code, Test Duration |
+| [Services per namespace](services-per-namespace.md)                  | Yes: Exit code, Test Duration |
+| [FIO I/O test](fio.md)                                               | No                            |
+| [Concurrent jobs with configmaps](concurent-jobs-with-configmaps.md) | No                            |
 
diff --git a/docs/concurent-jobs-with-configmaps.md b/docs/concurent-jobs-with-configmaps.md
new file mode 100644
index 00000000..bea33542
--- /dev/null
+++ b/docs/concurent-jobs-with-configmaps.md
@@ -0,0 +1,77 @@
+# Concurrent Jobs With Configmaps Workload
+
+The Concurrent Jobs with Configmaps test playbook is `workloads/concurrent-jobs-with-configmaps.yml`.
+This workload test is designed to check how many ConfigMaps and pods can be created before they slow down the cluster.
+
+```sh
+$ cp workloads/inventory.example inventory
+$ # Add orchestration host to inventory
+$ # Edit vars in workloads/vars/concurrent-jobs-with-configmaps.yml or define Environment vars (See below)
+$ time ansible-playbook -vv -i inventory workloads/concurrent-jobs-with-configmaps.yml
+```
+
+## Environment variables
+
+### PUBLIC_KEY
+Default: `~/.ssh/id_rsa.pub`
+Public ssh key file for Ansible.
+
+### PRIVATE_KEY
+Default: `~/.ssh/id_rsa`
+Private ssh key file for Ansible.
+
+### ORCHESTRATION_USER
+Default: `root`
+User for Ansible to log in as. Must authenticate with PUBLIC_KEY/PRIVATE_KEY.
+
+### WORKLOAD_IMAGE
+Default: `quay.io/openshift-scale/scale-ci-workload`
+Container image that runs the workload script.
+
+### WORKLOAD_JOB_NODE_SELECTOR
+Default: `false`
+Enables/disables the node selector that places the workload job on the `workload` node.
+
+### WORKLOAD_JOB_TAINT
+Default: `false`
+Enables/disables the toleration on the workload job to permit the `workload` taint.
+
+### WORKLOAD_JOB_PRIVILEGED
+Default: `true`
+Enables/disables running the workload pod as privileged.
+
+### KUBECONFIG_FILE
+Default: `~/.kube/config`
+Location of kubeconfig on orchestration host.
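+
+For example, to point the workload at a specific cluster and raise the job count for a single run (illustrative values; `NUMBER_OF_CONCURRENT_JOBS` is documented below):
+
+```sh
+$ export KUBECONFIG_FILE=~/.kube/config
+$ export NUMBER_OF_CONCURRENT_JOBS=500
+$ time ansible-playbook -vv -i inventory workloads/concurrent-jobs-with-configmaps.yml
+```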
+
+### PBENCH_INSTRUMENTATION
+Default: `false`
+Enables/disables running the workload wrapped by pbench-user-benchmark. When enabled, pbench agents can then be enabled (`ENABLE_PBENCH_AGENTS`) for further instrumentation data and pbench-copy-results can be enabled (`ENABLE_PBENCH_COPY`) to export captured data for further analysis.
+
+### ENABLE_PBENCH_AGENTS
+Default: `false`
+Enables/disables the collection of pbench data on the pbench agent Pods. These Pods are deployed by the tooling playbook.
+
+### ENABLE_PBENCH_COPY
+Default: `false`
+Enables/disables the copying of pbench data to a remote results server for further analysis.
+
+### PBENCH_SSH_PRIVATE_KEY_FILE
+Default: `~/.ssh/id_rsa`
+Location of ssh private key to authenticate to the pbench results server.
+
+### PBENCH_SSH_PUBLIC_KEY_FILE
+Default: `~/.ssh/id_rsa.pub`
+Location of the ssh public key to authenticate to the pbench results server.
+
+### PBENCH_SERVER
+Default: There is no public default.
+DNS address of the pbench results server.
+
+### NUMBER_OF_CONCURRENT_JOBS
+Default: `300`
+Number of concurrent jobs with configmaps to create during workload.
+
+### JOB_COMPLETION_POLL_ATTEMPTS
+Default: `360`
+Number of retries for Ansible to poll if the workload job has completed. Poll attempts delay 10s between polls with some additional time taken for each polling action depending on the orchestration host setup.
diff --git a/workloads/concurrent-jobs-with-configmaps.yaml b/workloads/concurrent-jobs-with-configmaps.yml
similarity index 92%
rename from workloads/concurrent-jobs-with-configmaps.yaml
rename to workloads/concurrent-jobs-with-configmaps.yml
index ad454931..e45f1752 100644
--- a/workloads/concurrent-jobs-with-configmaps.yaml
+++ b/workloads/concurrent-jobs-with-configmaps.yml
@@ -8,7 +8,7 @@
   gather_facts: true
   remote_user: "{{orchestration_user}}"
   vars_files:
-    - vars/concurrent-jobs-with-configmaps.yaml
+    - vars/concurrent-jobs-with-configmaps.yml
   vars:
     workload_job: "concurrent-jobs"
   tasks:
@@ -24,7 +24,7 @@
       with_items:
         - src: scale-ci-tooling-ns.yml
           dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml"
-        - src: workload-concurrent-jobs-with-configmaps-script-cm.yaml
+        - src: workload-concurrent-jobs-with-configmaps-script-cm.yml
           dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-concurrent-jobs-with-configmaps-script-cm.yml"
 
     - name: Slurp kubeconfig file
@@ -75,6 +75,15 @@
       retries: 60
       delay: 1
       when: scale_ci_tooling_ns_exists.rc == 0
 
+    - name: Ensure project concurrent-jobs-workload from previous workload is deleted
+      shell: |
+        oc delete project concurrent-jobs-workload
+      register: concurrent_jobs_workload_project
+      failed_when: concurrent_jobs_workload_project.rc == 0
+      until: concurrent_jobs_workload_project.rc == 1
+      retries: 60
+      delay: 1
+
     - name: Block for non-existing tooling namespace
       block:
         - name: Create tooling namespace
diff --git a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml
similarity index 82%
rename from workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
rename to workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml
index a938c9a2..c72d19fb 100644
--- a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yaml
+++ b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml
@@ -10,7 +10,7 @@ data:
     {
       for i in $(seq 1 $jobs_amount);
       do
-        cat /tmp/conc_jobs.yaml | sed "s/%JOB_ID%/$i/g" | oc create -f -
+        cat /root/workload/conc_jobs.yaml | sed "s/%JOB_ID%/$i/g" | oc create -f -
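+        # each pass renders every %JOB_ID% placeholder in the template (e.g. to
+        # "1" on the first iteration), creating one job-$i ConfigMap and one
+        # job-$i Job per loop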
       done
     }
     function wait_for_completion()
@@ -18,10 +18,12 @@
       running=`oc get pods | grep -c Completed`
       while [ $running -lt $jobs_amount ]; do
         sleep 1
-        running=`oc get pods | grep -E "Completed|OOMKilled" | wc -l`
+        running=`oc get pods -n concurrent-jobs-workload | grep -E "Completed|OOMKilled" | wc -l`
         echo "$running jobs are completed"
       done
     }
+
+    oc new-project concurrent-jobs-workload
     start_time=`date +%s`
     create_jobs
     wait_for_completion
@@ -29,7 +31,7 @@
     total_time=`echo $end_time - $start_time | bc`
     echo "OOMKILLED jobs:"
     oc get pods | grep OOMKilled
-    echo "Time taken for creating $jobs_amount concurrent jobs with configmaps $total_time"
+    echo "Time taken for creating $jobs_amount concurrent jobs with configmaps $total_time seconds"
   conc_jobs.yaml: |
     # Example from: https://github.com/kubernetes/kubernetes/issues/74412#issue-413387234
     ---
@@ -37,6 +39,7 @@
     kind: ConfigMap
     metadata:
       name: job-%JOB_ID%
+      namespace: concurrent-jobs-workload
     data:
       game.properties: |
         enemies=aliens
@@ -45,6 +48,7 @@
     kind: Job
     metadata:
       name: job-%JOB_ID%
+      namespace: concurrent-jobs-workload
     spec:
       template:
         spec:
diff --git a/workloads/templates/workload-env.yml.j2 b/workloads/templates/workload-env.yml.j2
index 40b25d18..068cfb96 100644
--- a/workloads/templates/workload-env.yml.j2
+++ b/workloads/templates/workload-env.yml.j2
@@ -103,4 +103,6 @@ data:
   PROMETHEUS_GRAPH_PERIOD: "{{prometheus_graph_period}}"
   PROMETHEUS_REFRESH_INTERVAL: "{{prometheus_refresh_interval}}"
   PROMETHEUS_SCALE_TEST_PREFIX: "{{prometheus_scale_test_prefix}}"
+{% elif workload_job == "concurrent-jobs" %}
+  NUMBER_OF_CONCURRENT_JOBS: "{{number_of_concurrent_jobs}}"
 {% endif %}
diff --git a/workloads/vars/concurrent-jobs-with-configmaps.yaml b/workloads/vars/concurrent-jobs-with-configmaps.yml
similarity index 93%
rename from workloads/vars/concurrent-jobs-with-configmaps.yaml
rename to workloads/vars/concurrent-jobs-with-configmaps.yml
index 6a3821a0..10900514 100644
--- a/workloads/vars/concurrent-jobs-with-configmaps.yaml
+++ b/workloads/vars/concurrent-jobs-with-configmaps.yml
@@ -13,7 +13,7 @@ workload_image: "{{ lookup('env', 'WORKLOAD_IMAGE')|default('quay.io/openshift-s
 
 workload_job_node_selector: "{{ lookup('env', 'WORKLOAD_JOB_NODE_SELECTOR')|default(false, true)|bool }}"
 workload_job_taint: "{{ lookup('env', 'WORKLOAD_JOB_TAINT')|default(false, true)|bool }}"
-workload_job_privileged: "{{ lookup('env', 'WORKLOAD_JOB_PRIVILEGED')|default(false, true)|bool }}"
+workload_job_privileged: "{{ lookup('env', 'WORKLOAD_JOB_PRIVILEGED')|default(true, true)|bool }}"
 
 kubeconfig_file: "{{ lookup('env', 'KUBECONFIG_FILE')|default('~/.kube/config', true) }}"
 
@@ -27,3 +27,4 @@ pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
 
 # Other variables for workload tests
 number_of_concurrent_jobs: "{{ lookup('env', 'NUMBER_OF_CONCURRENT_JOBS')|default(300, true)|int }}"
+job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(360, true)|int }}"
\ No newline at end of file
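For manual spot checks while the workload runs, a close variant of the filter workload.sh greps for can be run by hand (a minimal example, assuming the concurrent-jobs-workload project created by this patch):

    oc get pods -n concurrent-jobs-workload --no-headers | grep -cE "Completed|OOMKilled"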
From a07b78b05700caee7b71f37cb7b19c48f45f0d40 Mon Sep 17 00:00:00 2001
From: skordas
Date: Thu, 24 Oct 2019 15:43:01 -0400
Subject: [PATCH 4/4] Adding smoke test variables. Adding support for enabling/disabling pbench during the workload run.

---
 docs/concurent-jobs-with-configmaps.md        | 14 ++++++
 ...current-jobs-with-configmaps-script-cm.yml | 45 +++++++++++++++++++
 .../vars/concurrent-jobs-with-configmaps.yml  | 13 +++++-
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/docs/concurent-jobs-with-configmaps.md b/docs/concurent-jobs-with-configmaps.md
index bea33542..3fdb2104 100644
--- a/docs/concurent-jobs-with-configmaps.md
+++ b/docs/concurent-jobs-with-configmaps.md
@@ -75,3 +75,17 @@ Number of concurrent jobs with configmaps to create during workload.
 ### JOB_COMPLETION_POLL_ATTEMPTS
 Default: `360`
 Number of retries for Ansible to poll if the workload job has completed. Poll attempts delay 10s between polls with some additional time taken for each polling action depending on the orchestration host setup.
+
+## Smoke test variables
+
+```
+CONCURRENT_JOBS_NODE_COUNT=4
+CONCURRENT_JOBS_TEST_PREFIX=concurrentjobs_smoke
+CONCURRENT_JOBS_CLEANUP=true
+CONCURRENT_JOBS_BASENAME=concurrentjobs
+CONCURRENT_JOBS_MAXPODS=1000
+CONCURRENT_JOBS_POD_IMAGE="gcr.io/google_containers/pause-amd64:3.0"
+CONCURRENT_JOBS_STEPSIZE=50
+CONCURRENT_JOBS_PAUSE=60
+CONCURRENT_JOBS_TS_TIMEOUT=180
+```
\ No newline at end of file
diff --git a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml
index c72d19fb..9439d62f 100644
--- a/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml
+++ b/workloads/files/workload-concurrent-jobs-with-configmaps-script-cm.yml
@@ -4,6 +4,51 @@ metadata:
   name: scale-ci-workload-script
 data:
   run.sh: |
+    #!/bin/sh
+    set -eo pipefail
+    workload_log() { echo "$(date -u) $@" >&2; }
+    export -f workload_log
+    workload_log "Configuring pbench for Concurrent Jobs with ConfigMaps workload"
+    mkdir -p /var/lib/pbench-agent/tools-default/
+    echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd
+    if [ "${ENABLE_PBENCH_AGENTS}" = true ]; then
+      echo "" > /var/lib/pbench-agent/tools-default/disk
+      echo "" > /var/lib/pbench-agent/tools-default/iostat
+      echo "workload" > /var/lib/pbench-agent/tools-default/label
+      echo "" > /var/lib/pbench-agent/tools-default/mpstat
+      echo "" > /var/lib/pbench-agent/tools-default/oc
+      echo "" > /var/lib/pbench-agent/tools-default/perf
+      echo "" > /var/lib/pbench-agent/tools-default/pidstat
+      echo "" > /var/lib/pbench-agent/tools-default/sar
+      master_nodes=`oc get nodes -l pbench_agent=true,node-role.kubernetes.io/master= --no-headers | awk '{print $1}'`
+      for node in $master_nodes; do
+        echo "master" > /var/lib/pbench-agent/tools-default/remote@$node
+      done
+      infra_nodes=`oc get nodes -l pbench_agent=true,node-role.kubernetes.io/infra= --no-headers | awk '{print $1}'`
+      for node in $infra_nodes; do
+        echo "infra" > /var/lib/pbench-agent/tools-default/remote@$node
+      done
+      worker_nodes=`oc get nodes -l pbench_agent=true,node-role.kubernetes.io/worker= --no-headers | awk '{print $1}'`
+      for node in $worker_nodes; do
+        echo "worker" > /var/lib/pbench-agent/tools-default/remote@$node
+      done
+    fi
+    source /opt/pbench-agent/profile
+    workload_log "Done configuring pbench for Concurrent Jobs with ConfigMaps workload"
+
+    workload_log "Running Concurrent Jobs with ConfigMaps workload"
+    if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then
+      pbench-user-benchmark -- sh /root/workload/workload.sh
+      result_dir="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1/sample1
+      if [ "${ENABLE_PBENCH_COPY}" = "true" ]; then
+        pbench-copy-results --prefix ${CONCURRENT_JOBS_TEST_PREFIX}
+      fi
+    else
+      sh /root/workload/workload.sh
+      result_dir=/tmp
+    fi
+    workload_log "Completed Concurrent Jobs with ConfigMaps run"
+  workload.sh: |
     #!/bin/bash
     jobs_amount=${NUMBER_OF_CONCURRENT_JOBS}
     function create_jobs()
diff --git a/workloads/vars/concurrent-jobs-with-configmaps.yml b/workloads/vars/concurrent-jobs-with-configmaps.yml
index 10900514..11c99b0a 100644
--- a/workloads/vars/concurrent-jobs-with-configmaps.yml
+++ b/workloads/vars/concurrent-jobs-with-configmaps.yml
@@ -27,4 +27,15 @@ pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
 
 # Other variables for workload tests
 number_of_concurrent_jobs: "{{ lookup('env', 'NUMBER_OF_CONCURRENT_JOBS')|default(300, true)|int }}"
-job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(360, true)|int }}"
\ No newline at end of file
+job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(360, true)|int }}"
+
+# Concurrent jobs smoke test workload specific parameters:
+concurrent_jobs_node_count: "{{ lookup('env', 'CONCURRENT_JOBS_NODE_COUNT')|default(4, true)|int }}"
+concurrent_jobs_test_prefix: "{{ lookup('env', 'CONCURRENT_JOBS_TEST_PREFIX')|default('concurrentjobs_smoke', true) }}"
+concurrent_jobs_cleanup: "{{ lookup('env', 'CONCURRENT_JOBS_CLEANUP')|default(true, true)|bool|lower }}"
+concurrent_jobs_basename: "{{ lookup('env', 'CONCURRENT_JOBS_BASENAME')|default('concurrentjobs', true) }}"
+concurrent_jobs_maxpods: "{{ lookup('env', 'CONCURRENT_JOBS_MAXPODS')|default(1000, true)|int }}"
+concurrent_jobs_pod_image: "{{ lookup('env', 'CONCURRENT_JOBS_POD_IMAGE')|default('gcr.io/google_containers/pause-amd64:3.0', true) }}"
+concurrent_jobs_stepsize: "{{ lookup('env', 'CONCURRENT_JOBS_STEPSIZE')|default(50, true)|int }}"
+concurrent_jobs_pause: "{{ lookup('env', 'CONCURRENT_JOBS_PAUSE')|default(60, true)|int }}"
+concurrent_jobs_ts_timeout: "{{ lookup('env', 'CONCURRENT_JOBS_TS_TIMEOUT')|default(180, true)|int }}"
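
The smoke-test variables above are read from the environment on the orchestration host, but only values surfaced through workloads/templates/workload-env.yml.j2 reach the workload pod (PATCH 3 wires up NUMBER_OF_CONCURRENT_JOBS only). A minimal sketch of how further values, such as the test prefix used by pbench-copy-results, could be exposed by extending the same template block (illustrative only, not part of this series):

    {% elif workload_job == "concurrent-jobs" %}
      NUMBER_OF_CONCURRENT_JOBS: "{{number_of_concurrent_jobs}}"
      CONCURRENT_JOBS_TEST_PREFIX: "{{concurrent_jobs_test_prefix}}"
      CONCURRENT_JOBS_MAXPODS: "{{concurrent_jobs_maxpods}}"
    {% endif %}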