From b0289e590b7588c089a9589671530b3022e80f09 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 21 Oct 2022 02:31:15 -0400 Subject: [PATCH 01/28] Move the SNMP trap delivery checks (#381) (#384) * Move the SNMP trap delivery checks Move the SNMP trap delivery checks as where they are situated now seems to cause false positives. Moving the checks closer to the end of the smoketest run seems to result in a better chance that the logs the check is looking for have been provided. * Use a loop to check for SNMP status with break and max time (cherry picked from commit 1dc0808ace835f2410c35b1b523398ed35f797f8) --- tests/smoketest/smoketest.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index eb1a30cd5..89135ba32 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -75,8 +75,18 @@ done oc delete pod curl SNMP_WEBHOOK_POD=$(oc get pod -l "app=default-snmp-webhook" -ojsonpath='{.items[0].metadata.name}') -oc logs "$SNMP_WEBHOOK_POD" | grep 'Sending SNMP trap' -SNMP_WEBHOOK_STATUS=$? +SNMP_WEBHOOK_CHECK_MAX_TRIES=5 +SNMP_WEBHOOK_CHECK_TIMEOUT=30 +SNMP_WEBHOOK_CHECK_COUNT=0 +while [ $SNMP_WEBHOOK_CHECK_COUNT -lt $SNMP_WEBHOOK_CHECK_MAX_TRIES ]; do + oc logs "$SNMP_WEBHOOK_POD" | grep 'Sending SNMP trap' + SNMP_WEBHOOK_STATUS=$? + (( SNMP_WEBHOOK_CHECK_COUNT=SNMP_WEBHOOK_CHECK_COUNT+1 )) + if [ $SNMP_WEBHOOK_STATUS -eq 0 ]; then + break + fi + sleep $SNMP_WEBHOOK_CHECK_TIMEOUT +done echo "*** [INFO] Showing oc get all..." 
oc get all From 0c4d8aaf314276b60a8d5d148b09f6273d0c3755 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 28 Oct 2022 16:40:22 -0400 Subject: [PATCH 02/28] Lock the bundle to OCP v4.10 (#385) (#386) (cherry picked from commit 1560d3cad81cba367d31858d0c180ff761d937c8) --- deploy/olm-catalog/service-telemetry-operator/Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index c6488bfea..eaa25d334 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.10" +LABEL com.redhat.openshift.versions="=v4.10" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ From 9c5b82a1fedf9403d5e28cf9131de70e367e7ef1 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 8 Nov 2022 08:39:15 -0500 Subject: [PATCH 03/28] Fixes for 17.0 ir script (#380) (#383) (cherry picked from commit 69d73b7463a072ab3364d4c8be0bb8fe8a0003c6) Co-authored-by: Chris Sibbitt Co-authored-by: Matthias Runge --- tests/infrared/17.0/infrared-openstack.sh | 2 +- tests/infrared/17.0/stf-connectors.yaml.template | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/infrared/17.0/infrared-openstack.sh b/tests/infrared/17.0/infrared-openstack.sh index 87257d323..17ff67a35 100755 --- a/tests/infrared/17.0/infrared-openstack.sh +++ b/tests/infrared/17.0/infrared-openstack.sh @@ -95,7 +95,7 @@ ir_create_undercloud() { } stf_create_config() { - sed -r 
"s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml + sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml } gnocchi_create_config() { diff --git a/tests/infrared/17.0/stf-connectors.yaml.template b/tests/infrared/17.0/stf-connectors.yaml.template index b667fd835..c29b518e7 100644 --- a/tests/infrared/17.0/stf-connectors.yaml.template +++ b/tests/infrared/17.0/stf-connectors.yaml.template @@ -10,15 +10,15 @@ custom_templates: # set parameter defaults to match stable-1.3 documentation parameter_defaults: MetricsQdrConnectors: - - host: <> - port: <> - role: edge - verifyHostname: false - sslProfile: sslProfile + - host: <> + port: <> + role: edge + verifyHostname: false + sslProfile: sslProfile MetricsQdrSSLProfiles: - - name: sslProfile - caCertFileContent: | + - name: sslProfile + caCertFileContent: | <> CeilometerQdrEventsConfig: From 907dbc4238ec1a308cf2ef67cecef98c0d7ed49d Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 14 Nov 2022 16:32:52 -0500 Subject: [PATCH 04/28] Make all certs 8yr expiry (#387) (#389) * Make all certs 8yr expiry * Use certificate_duration and test against generated cert * Better messages during CI cloning (cherry picked from commit 4b5d7a156cf38cd862825f845d021630bc1b1c20) --- build/stf-run-ci/tasks/clone_repos.yml | 10 +++++----- roles/servicetelemetry/defaults/main.yml | 2 ++ .../tasks/_local_signing_authority.yml | 4 ++++ roles/servicetelemetry/tasks/component_qdr.yml | 8 ++++++++ tests/smoketest/smoketest.sh | 9 +++++++++ 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 883211090..97e351bdc 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ 
b/build/stf-run-ci/tasks/clone_repos.yml @@ -5,7 +5,7 @@ # of these separately rather than using a loop. - name: Get Smart Gateway Operator block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from SGO repository git: repo: "{{ sgo_repository }}" dest: working/smart-gateway-operator @@ -19,7 +19,7 @@ - name: Get sg-core block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from sg-core repository git: repo: "{{ sg_core_repository }}" dest: working/sg-core @@ -33,7 +33,7 @@ - name: Get sg-bridge block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from sg-bridge repository git: repo: "{{ sg_bridge_repository }}" dest: working/sg-bridge @@ -47,7 +47,7 @@ - name: Get prometheus-webhook-snmp block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from prometheus-webhook-snmp repository git: repo: "{{ prometheus_webhook_snmp_repository }}" dest: working/prometheus-webhook-snmp @@ -64,7 +64,7 @@ # branches there. 
- name: Get Loki Operator block: - - name: Try cloning same-named branch or override branch from specified repository + - name: Try cloning same-named branch or override branch from loki repository git: repo: "{{ loki_operator_repository }}" dest: working/loki diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 95346f137..2ba6d4b43 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -6,6 +6,8 @@ clouds_remove_on_missing: false # default observability strategy (compatible with STF 1.3) observability_strategy: use_community +certificate_duration: 70080h + servicetelemetry_defaults: high_availability: enabled: false diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/_local_signing_authority.yml index 346e2c770..f4ae3fd62 100644 --- a/roles/servicetelemetry/tasks/_local_signing_authority.yml +++ b/roles/servicetelemetry/tasks/_local_signing_authority.yml @@ -8,6 +8,7 @@ name: '{{ ansible_operator_meta.namespace }}-selfsigned' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' selfSigned: {} - name: Create CA certificate @@ -19,6 +20,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' secretName: '{{ ansible_operator_meta.namespace }}-ca' commonName: '{{ ansible_operator_meta.namespace }}-ca' isCA: true @@ -34,6 +36,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' ca: secretName: '{{ ansible_operator_meta.namespace }}-ca' @@ -47,6 +50,7 @@ name: elasticsearch-es-http namespace: '{{ ansible_operator_meta.namespace }}' spec: + duration: '{{ certificate_duration }}' commonName: elasticsearch-es-http secretName: 'elasticsearch-es-cert' dnsNames: diff --git 
a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 2247db84a..cf7cc937b 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -13,6 +13,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' selfSigned: {} - name: Create self-signed interconnect certificate @@ -25,6 +26,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" isCA: true issuerRef: @@ -42,6 +44,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" @@ -55,6 +58,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" isCA: true issuerRef: @@ -71,6 +75,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" @@ -88,6 +93,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' ca: secretName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" @@ -101,6 +107,7 @@ name: "{{ 
ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" isCA: true issuerRef: @@ -117,6 +124,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: + duration: '{{ certificate_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 89135ba32..aa82145cf 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -73,6 +73,15 @@ for NAME in "${CLOUDNAMES[@]}"; do RET=$((RET || $?)) # Accumulate exit codes done +echo "*** [INFO] Checking that the qdr certificate has a long expiry" +EXPIRETIME=$(oc get secret default-interconnect-openstack-ca -o json | grep \"tls.crt\"\: | awk -F '": "' '{print $2}' | rev | cut -c3- | rev | base64 -d | openssl x509 -in - -text | grep "Not After" | awk -F " : " '{print $2}') +EXPIRETIME_UNIX=$(date -d "${EXPIRETIME}" "+%s") +TARGET_UNIX=$(date -d "now + 7 years" "+%s") +if [ ${EXPIRETIME_UNIX} -lt ${TARGET_UNIX} ]; then + echo "[FAILURE] Certificate expire time (${EXPIRETIME}) less than 7 years from now" +fi + +echo "*** [INFO] Waiting to see SNMP trap message in webhook pod" oc delete pod curl SNMP_WEBHOOK_POD=$(oc get pod -l "app=default-snmp-webhook" -ojsonpath='{.items[0].metadata.name}') SNMP_WEBHOOK_CHECK_MAX_TRIES=5 From 90ae8eed2899dd931bcfbcf07947c94210a3a950 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 30 Nov 2022 11:50:02 -0500 Subject: [PATCH 05/28] Update metadata to reference stable-1.5 (#397) Update the metadata to reference stable-1.5 so that we can perform nightly builds against upstream stable-1.5 repository and result in 
an index image that can be referenced by a CatalogSource manifest to allow installation of STF 1.5 content. --- build/metadata.sh | 10 +++++----- .../metadata/annotations.yaml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/build/metadata.sh b/build/metadata.sh index 38c314c4d..f2020b050 100644 --- a/build/metadata.sh +++ b/build/metadata.sh @@ -8,20 +8,20 @@ OPERATOR_SDK=${OPERATOR_SDK:-operator-sdk} OPERATOR_NAME=${OPERATOR_NAME:-service-telemetry-operator} IMAGE_BUILDER=${IMAGE_BUILDER:-podman} IMAGE_BUILD_ARGS=${IMAGE_BUILD_ARGS:-''} -IMAGE_TAG=${IMAGE_TAG:-latest} +IMAGE_TAG=${IMAGE_TAG:-stable-1.5} REQUIRED_OPERATOR_SDK_VERSION=${REQUIRED_OPERATOR_SDK_VERSION:-v0.19.4} SERVICE_TELEMETRY_SUBSCRIPTION=${SERVICE_TELEMETRY_SUBSCRIPTION:-service-telemetry-operator-stable-infrawatch-operators-openshift-marketplace} OPERATOR_IMAGE=${OPERATOR_IMAGE:-"quay.io/infrawatch/${OPERATOR_NAME}"} -OPERATOR_TAG=${OPERATOR_TAG:-latest} +OPERATOR_TAG=${OPERATOR_TAG:-stable-1.5} OPERATOR_CSV_MAJOR_VERSION=${OPERATOR_CSV_MAJOR_VERSION:-1.5} OPERATOR_DOCUMENTATION_URL=${OPERATOR_DOCUMENTATION_URL:-"https://infrawatch.github.io/documentation"} BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND=${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND:-1.3.0} CREATED_DATE=${CREATED_DATE:-$(date +'%Y-%m-%dT%H:%M:%SZ')} RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP:-quay.io/infrawatch/prometheus-webhook-snmp} -RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG:-latest} +RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG:-stable-1.5} BUNDLE_PATH=${BUNDLE_PATH:-deploy/olm-catalog/service-telemetry-operator} -BUNDLE_CHANNELS=${BUNDLE_CHANNELS:-unstable} -BUNDLE_DEFAULT_CHANNEL=${BUNDLE_DEFAULT_CHANNEL:-unstable} +BUNDLE_CHANNELS=${BUNDLE_CHANNELS:-stable-1.5} +BUNDLE_DEFAULT_CHANNEL=${BUNDLE_DEFAULT_CHANNEL:-stable-1.5} 
OPERATOR_BUNDLE_IMAGE=${OPERATOR_BUNDLE_IMAGE:-"quay.io/infrawatch-operators/${OPERATOR_NAME}-bundle"} # Automatic diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/annotations.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/annotations.yaml index 8b7854f90..949739f80 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/annotations.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/annotations.yaml @@ -1,6 +1,6 @@ annotations: - operators.operatorframework.io.bundle.channel.default.v1: unstable - operators.operatorframework.io.bundle.channels.v1: unstable + operators.operatorframework.io.bundle.channel.default.v1: stable-1.5 + operators.operatorframework.io.bundle.channels.v1: stable-1.5 operators.operatorframework.io.bundle.manifests.v1: manifests/ operators.operatorframework.io.bundle.mediatype.v1: registry+v1 operators.operatorframework.io.bundle.metadata.v1: metadata/ From c3868c413c1c1617505d09f3a7c375e7ad4a861c Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Tue, 28 Feb 2023 09:28:58 -0500 Subject: [PATCH 06/28] GHA checkout action v2 is deprecated (#407) (#408) The GitHub Actions checkout action v2 is deprecated and needs to move to version 3. (cherry picked from commit 7687cd7744cf118a0abbbb550e1bbd148efeeffb) --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d5e04ce09..d774c0e24 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Ansible run: python -m pip install 'ansible <= 2.9' @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Verify image builds run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . 
@@ -44,7 +44,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Get operator-sdk image run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu From 1dbc11210c8d4536ad832c7952d6d64b1f5d6253 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 1 Mar 2023 17:21:57 -0500 Subject: [PATCH 07/28] CI change to pre-clean cert-manager-operator (#409) * not 100% sure this is 4.12 related, but it's new and first seen during testing 4.12 (cherry picked from commit 37dceed7e55856820c10fb812da0ed9cd6551a3b with modifications) Co-authored-by: Chris Sibbitt --- build/stf-run-ci/tasks/pre-clean.yml | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index f443bfc2d..208b130cd 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -29,3 +29,33 @@ kind: clusterroles label_selectors: - "olm.owner.namespace = {{ namespace }}" + +# Clean the environment if it has OperatorHub.io CatalogSource still enabled as +# environment is using community-operators CatalogSource when use_community has +# been enabled. This avoids installing an additional CatalogSource which is no +# longer required. 
+- name: Remove OperatorHub.io CatalogSource if it installed + k8s: + state: absent + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: operatorhubio-operators + namespace: openshift-marketplace + spec: + sourceType: grpc + image: quay.io/operatorhubio/catalog:latest + displayName: OperatorHub.io Operators + publisher: OperatorHub.io + +# Remove the cert manager since we install it as part of the CI/documented pre-install process +- name: Remove openshift-cert-manager-operator namespace + k8s: + state: absent + wait: yes + definition: + apiVersion: project.openshift.io/v1 + kind: Project + metadata: + name: openshift-cert-manager-operator From ecc5e5e513dea36cb87071e146b1d41436086866 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Tue, 7 Mar 2023 12:34:23 -0500 Subject: [PATCH 08/28] STF 1.5.1 release ops (#413) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fixes for 17.0 ir script (#380) * Move the SNMP trap delivery checks (#381) * Move the SNMP trap delivery checks Move the SNMP trap delivery checks as where they are situated now seems to cause false positives. Moving the checks closer to the end of the smoketest run seems to result in a better chance that the logs the check is looking for have been provided. * Use a loop to check for SNMP status with break and max time * Lock the bundle to OCP v4.10 (#385) * Make all certs 8yr expiry * Revert "Make all certs 8yr expiry" This reverts commit af35714b5d08720063b6dc0e6a3e46962ff1ec64. * Make all certs 8yr expiry (#387) * Make all certs 8yr expiry * Use certificate_duration and test against generated cert * Better messages during CI cloning * Expand support for OCP 4.11 (#391) * Expand support for OCP 4.11 Allow installation to be done on OCP 4.11 while updating the smoketest jobs to support later versions of the client. Also migrate to using community-operators CatalogSource instead of OperatorHub.io. 
Only enable community-operators when the use_community strategy is enabled. Update the token request syntax when requesting a service account token. Add checks to look for oc client version and fail if we're using a version that's too old. * Make passwords safer in smoketest job template Encapsulate the password values with double quotes to help make them safer for consumption in the template. I had an odd situation where the password contained a bunch of extended characters and caused the smoketest to report an error on the template having an issue with yaml to json. The password contained several characters such as . and : which confused the template. Wrapping the contents in the double quotes allowed the smoketest to apply the job.batch template and result in a working smoketest run. * Force SGO checkout during build (#388) * Replacing the placeholder namespace during the build results in a "there are local changes" error on next build * This forces the checkout to discard that (and other!?) local changes * Quicker dev/test loop * Update oc to 4.11 in jenkins agent (#393) * Update oc to 4.11 in jenkins agent Need 4.11 for new token handling changes * Remove OperatorHub.io as a CatalogSource (#394) Remove the OperatorHub.io CatalogSource and instead use the community-operators CatalogSource which is available with an OCP installation. Ideally this will avoid some of the conflicts we've been seeing in our CI environment. This is a short term fix as future development will likely make use of Observability Operator to provide the metrics data store and alert delivery mechanism. 
* Changes for 4.12 (#401) * Catalog changes * CI change to pre-clean cert-manager-operator * not 100% sure this is 4.12 related, but it's new and first seen during testing 4.12 * Remove Loki from stf-run-ci (#405) * Remove Loki from stf-run-ci * Return "Get new operator sdk" to stf-run-ci * GHA checkout action v2 is deprecated (#407) The GitHub Actions checkout action v2 is deprecated and needs to move to version 3. * Implement SNMPtrap delivery controls (#404) * Implement SNMPtrap delivery controls Implement the ability to override the default values for the SNMPtrap alertmanager receiver via prometheus-webhook-snmp component. Closes: STF-559 * Run operator-sdk generate bundle Run the following command to update the bundle artifacts: operator-sdk-0.19.4 generate bundle --metadata --manifests --channels unstable --default-channel unstable * Build out the remaining SNMP options Build out the remaining options for prometheus-webhook-snmp to allow for finer-grained controls and delivery of SNMP traps via alertmanager alerts. * Generate bundle contents with operator-sdk * Implement changes for operator-sdk-1.26.0 testing (#411) * Implement changes for operator-sdk-1.26.0 testing Implement changes that allow testing validation via operator-sdk-1.26.0 without bumping the entire bundle generation process from operator-sdk-0.19.4 to post-operator-sdk-1.x. These are the same tests run for validation during product pipeline verification. * Adds test to verify building of the bundle image works. * Adds KinD deployment to allow executing scorecard checks. Related: STF-1252 * Fix properties.yaml * Simplify use of RELEASE_VERSION variable (#412) * Add note about why we're copying files in * Expose ability to set certificate renewal target times (#406) * Adds duration param for CA and endpoint certs Replaces certificate_duration with ca_certificate_duration and endpoint_certificate_duration. 
Set default value for those to 70080h (previous value) Removes the certificate_duration param from the Issuer resource since it's not actually needed (see [0]) [0] https://cert-manager.io/docs/reference/api-docs/#cert-manager.io/v1.IssuerConfig * Exposes CA and endpoint certificate duration config Exposes certificate duration config for both ElasticSearch and QDR Keeps the default value in use for now. Better default values should be discussed to be included in a follow-up change. * Fix indentation for certs duration param in servicetelemetry crd * Adds cert duration to the OLM catalog Includes cert duration params in the OLM catalog for both ElasticSearch and QDR * Changes snake_case to camelCase to yaml case Fix to match style convention * Adds pattern expression for certs duration * Add certificates param to events and transport * Exposes duration parameter in the CI script Adds the duration parameter for both ElasticSearch and QDR in the CI script Also updates the OLM Catalog with the latest changes (certificates object) * Corrects naming to certificates params in CI script * Fix snake case in the CI script params for cert duration * Fix indentation for transports in the deploy_stf CI script --------- Co-authored-by: Chris Sibbitt --------- Co-authored-by: Leif Madsen Co-authored-by: Jaromír Wysoglad Co-authored-by: Victoria Martinez de la Cruz --- .github/workflows/main.yml | 70 ++++++++-- .jenkins/agent/Dockerfile | 2 + build/generate_bundle.sh | 13 ++ build/stf-run-ci/README.md | 25 +++- build/stf-run-ci/defaults/main.yml | 30 ++-- build/stf-run-ci/tasks/clone_repos.yml | 23 +--- build/stf-run-ci/tasks/deploy_stf.yml | 20 ++- build/stf-run-ci/tasks/main.yml | 73 +--------- build/stf-run-ci/tasks/pre-clean.yml | 1 - build/stf-run-ci/tasks/setup_base.yml | 20 +-- .../tasks/setup_stf_local_build.yml | 129 ------------------ .../infra.watch_servicetelemetrys_crd.yaml | 52 ++++++- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 14 ++ 
.../service-telemetry-operator/Dockerfile.in | 2 +- .../infra.watch_servicetelemetrys_crd.yaml | 68 ++++++++- ...emetry-operator.clusterserviceversion.yaml | 19 ++- .../metadata/properties.yaml | 3 + .../tests/scorecard/config.yaml | 21 +++ roles/servicetelemetry/defaults/main.yml | 14 ++ .../tasks/_local_signing_authority.yml | 4 +- .../servicetelemetry/tasks/component_qdr.yml | 10 +- .../templates/manifest_snmp_traps.j2 | 16 ++- tests/smoketest/smoketest.sh | 13 +- tests/smoketest/smoketest_job.yaml.template | 8 +- 24 files changed, 350 insertions(+), 300 deletions(-) create mode 100644 deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml create mode 100644 deploy/olm-catalog/service-telemetry-operator/tests/scorecard/config.yaml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d774c0e24..5cbce3e9e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,15 +20,9 @@ jobs: - name: Lint Ansible roles/servicetelemetry/ directory run: ${HOME}/.local/bin/ansible-lint roles/servicetelemetry -# TODO: requires a bunch of work on our bash scripts, or finesse -# - name: Run Super-Linter -# uses: github/super-linter@v3 -# env: -# DEFAULT_BRANCH: master -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - build-check: - name: Build check + + build-operator-check: + name: Build Operator check runs-on: ubuntu-20.04 steps: @@ -38,15 +32,46 @@ jobs: - name: Verify image builds run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . 
- bundle-check: - name: Bundle check + build-bundle-check: + name: Build bundle check runs-on: ubuntu-20.04 + env: + RELEASE_VERSION: v0.19.4 steps: - name: Checkout code uses: actions/checkout@v3 - - name: Get operator-sdk image + - name: Get operator-sdk image 0.19.4 + run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu + + - name: Make operator-sdk executable + run: chmod +x operator-sdk + + - name: Move operator-sdk binary + run: sudo mv operator-sdk /usr/local/bin + + - name: Create working directory + run: mkdir /tmp/bundle + + - name: Generate bundle + run: WORKING_DIR=/tmp/bundle ./build/generate_bundle.sh + + - name: Verify image builds + run: docker build --tag infrawatch/service-telemetry-operator:latest --file build/Dockerfile . + + check-bundle-validation-scorecard: + name: Validate the generated bundle and perform scorecard checks + runs-on: ubuntu-20.04 + env: + RELEASE_VERSION: v1.26.0 + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + # prepare environment to buld the bundle + - name: Get operator-sdk image 0.19.4 run: curl --output operator-sdk -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk-$RELEASE_VERSION-x86_64-linux-gnu env: RELEASE_VERSION: v0.19.4 @@ -60,8 +85,27 @@ jobs: - name: Create working directory run: mkdir /tmp/bundle + # generate the bundle using operator-sdk-0.19.4 - name: Generate bundle run: WORKING_DIR=/tmp/bundle ./build/generate_bundle.sh + # prepare the environment to run bundle validation and bundle scorecard checks + - name: Get operator-sdk image 1.26.0 + run: curl --output operator-sdk-$RELEASE_VERSION -JL https://github.com/operator-framework/operator-sdk/releases/download/$RELEASE_VERSION/operator-sdk_linux_amd64 + + - name: Make operator-sdk executable + run: chmod +x operator-sdk-$RELEASE_VERSION + + - name: Move operator-sdk 
binary + run: sudo mv operator-sdk-$RELEASE_VERSION /usr/local/bin + + # perform bundle validation - name: Check bundle validation - run: operator-sdk bundle validate --verbose /tmp/bundle + run: operator-sdk-$RELEASE_VERSION bundle validate --verbose /tmp/bundle + + - name: Create KinD cluster to execute scorecard tests + uses: helm/kind-action@v1.4.0 + + # perform scorecard checks against a KinD cluster + - name: Check scorecord validation + run: operator-sdk-$RELEASE_VERSION scorecard --verbose /tmp/bundle diff --git a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile index 5e4b026d8..fe1a7fa70 100644 --- a/.jenkins/agent/Dockerfile +++ b/.jenkins/agent/Dockerfile @@ -6,3 +6,5 @@ RUN dnf install -y ansible golang python38 && \ alternatives --set python /usr/bin/python3.8 && \ python -m pip install openshift kubernetes "ansible-core~=2.12" && \ ansible-galaxy collection install -f 'kubernetes.core:>=2.2.0' community.general +RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-4.11/openshift-client-linux.tar.gz" && \ + tar -xv -C /usr/local/bin -f openshift-client-linux.tar.gz diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index bfccecc4c..8c5b13934 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -38,6 +38,18 @@ generate_bundle() { echo "---- Generated bundle complete at ${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" } +copy_extra_metadata() { + # We add this because our version of operator-sdk for building doesn't + # understand these files, but newer versions of operator-sdk (for testing + # purposes) does, and newer versions of opm (as used in both downstream and + # upstream index image builds) also understands these files. Just copy them + # into the bundle directory during building. 
+ echo "-- Copy extra metadata in" + pushd "${REL}/../" + cp -r ./deploy/olm-catalog/service-telemetry-operator/tests/ "${WORKING_DIR}" + cp ./deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml "${WORKING_DIR}/metadata/" +} + build_bundle_instructions() { echo "-- Commands to create a bundle build" echo docker build -t "${OPERATOR_BUNDLE_IMAGE}:${OPERATOR_BUNDLE_VERSION}" -f "${WORKING_DIR}/Dockerfile" "${WORKING_DIR}" @@ -51,5 +63,6 @@ generate_version create_working_dir generate_dockerfile generate_bundle +copy_extra_metadata build_bundle_instructions echo "## End Bundle creation" diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 4e654bb44..87ca2f0aa 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -21,10 +21,10 @@ choose to override: | Parameter name | Values | Default | Description | | ------------------------------ | ------------ | --------- | ------------------------------------ | | `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | -| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | -| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with __local_build_enabled) | -| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | +| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. 
Also see `working_branch`, `sg_branch`, `sgo_branch` | +| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | +| `__service_telemetry_bundle_image_path` | | | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | | Image path to Smart Gateway Operator bundle | | `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | | `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | | `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | @@ -35,18 +35,29 @@ choose to override: | `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | | `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | | `loki_operator_repository` | | https://github.com/viaq/loki-operator | Which Loki-operator git repository to clone | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | | `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | | `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | | `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | | `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable 
ephemeral storage support in ServiceTelemetry | | `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | | `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | +| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | +| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | +| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | +| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | +| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | +| `__service_telemetry_snmptraps_alert_oid_label` | | oid | The alert label name in which to look for the OID value. Defaults to oid. | +| `__service_telemetry_snmptraps_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | +| `__service_telemetry_snmptraps_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | +| `__service_telemetry_snmptraps_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | | `__service_telemetry_logs_enabled` | {true,false} | false | Whether to enable logs support in ServiceTelemetry | -| `__service_telemetry_observability_strategy` | | use_community | Which observability strategy to use for deployment. Default deployment is 'use_community'.
Also supported is 'none' | +| `__service_telemetry_transports_certificates_endpoint_cert_duration`| [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 2160h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | | `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | -| `__deploy_minio_enabled` | {true,false} | false | Whether to deploy minio while deploying loki-operator for logging development purposes | | `__deploy_loki_enabled` | {true,false} | false | Whether to deploy loki-operator and other systems for logging development purposes | -| `__loki_skip_tls_verify` | {true,false} | false | Whether to skip TLS verify for Loki S3 connection | | `__golang_image_path` | | quay.io/infrawatch/golang:1.16 | Golang image path for building the loki-operator image | | `__loki_image_path` | | quay.io/infrawatch/loki:2.2.1 | Loki image path for Loki microservices | diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 880e0cc55..9b54f0ce3 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -9,55 +9,51 @@ __local_build_enabled: true __deploy_from_bundles_enabled: false __deploy_stf: true +__service_telemetry_events_certificates_endpoint_cert_duration: 2160h +__service_telemetry_events_certificates_ca_cert_duration: 70080h __service_telemetry_events_enabled: true __service_telemetry_high_availability_enabled: false __service_telemetry_metrics_enabled: true __service_telemetry_storage_ephemeral_enabled: false __service_telemetry_snmptraps_enabled: true +__service_telemetry_snmptraps_target: "192.168.24.254" +__service_telemetry_snmptraps_community: "public" +__service_telemetry_snmptraps_retries: 5
+__service_telemetry_snmptraps_timeout: 1 +__service_telemetry_snmptraps_port: 162 +__service_telemetry_snmptraps_alert_oid_label: "oid" +__service_telemetry_snmptraps_trap_oid_prefix: "1.3.6.1.4.1.50495.15" +__service_telemetry_snmptraps_trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" +__service_telemetry_snmptraps_trap_default_severity: "" __service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_community +__service_telemetry_transports_certificates_endpoint_cert_duration: 2160h +__service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: __smart_gateway_bundle_image_path: -__deploy_minio_enabled: false -__deploy_loki_enabled: false -__loki_skip_tls_verify: false -__golang_image_path: quay.io/infrawatch/golang:1.16 -__loki_image_path: quay.io/infrawatch/loki:2.4.2 - sgo_image_tag: latest sto_image_tag: latest sg_core_image_tag: latest sg_bridge_image_tag: latest prometheus_webhook_snmp_image_tag: latest -loki_operator_image_tag: latest new_operator_sdk_version: v1.11.0 -new_go_version: 1.16.3 namespace: service-telemetry pull_secret_registry: pull_secret_user: pull_secret_pass: -# Set a default commit hash to clone for loki-operator to freeze -# the operator developement.
-loki_operator_branch: b8e9973 - - # used when building images to default to correct version branch for STF subcomponents per STF version version_branches: sgo: master sg_core: master sg_bridge: master prometheus_webhook_snmp: master - loki_operator: master sgo_repository: https://github.com/infrawatch/smart-gateway-operator sg_core_repository: https://github.com/infrawatch/sg-core sg_bridge_repository: https://github.com/infrawatch/sg-bridge prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-webhook-snmp -loki_operator_repository: https://github.com/grafana/loki - -loki_operator_folder: operator base_dir: '' diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 97e351bdc..d4f2173d3 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -10,6 +10,7 @@ repo: "{{ sgo_repository }}" dest: working/smart-gateway-operator version: "{{ sgo_branch | default(branch, true) }}" + force: yes rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" git: @@ -58,25 +59,3 @@ repo: https://github.com/infrawatch/prometheus-webhook-snmp dest: working/prometheus-webhook-snmp version: "{{ version_branches.prometheus_webhook_snmp }}" - -# Branches for Loki Operator don't work the same as with other repositories. -# We don't have write access to the upstream repository to create our own -# branches there. 
-- name: Get Loki Operator - block: - - name: Try cloning same-named branch or override branch from loki repository - git: - repo: "{{ loki_operator_repository }}" - dest: working/loki - version: "{{ loki_operator_branch | default(branch, true) }}" - force: yes - rescue: - - name: "Get {{ version_branches.loki_operator }} upstream branch because specified branch or repository doesn't exist" - git: - repo: https://github.com/grafana/loki - dest: working/loki - version: "{{ version_branches.loki_operator }}" - force: yes - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" diff --git a/build/stf-run-ci/tasks/deploy_stf.yml b/build/stf-run-ci/tasks/deploy_stf.yml index 596509cf5..bc49897c0 100644 --- a/build/stf-run-ci/tasks/deploy_stf.yml +++ b/build/stf-run-ci/tasks/deploy_stf.yml @@ -21,6 +21,15 @@ receivers: snmpTraps: enabled: {{ __service_telemetry_snmptraps_enabled }} + target: "{{ __service_telemetry_snmptraps_target }}" + community: "{{ __service_telemetry_snmptraps_community }}" + retries: {{ __service_telemetry_snmptraps_retries }} + port: {{ __service_telemetry_snmptraps_port }} + timeout: {{ __service_telemetry_snmptraps_timeout }} + alertOidLabel: "{{ __service_telemetry_snmptraps_alert_oid_label }}" + trapOidPrefix: "{{ __service_telemetry_snmptraps_trap_oid_prefix }}" + trapDefaultOid: "{{ __service_telemetry_snmptraps_trap_default_oid }}" + trapDefaultSeverity: "{{ __service_telemetry_snmptraps_trap_default_severity }}" backends: events: elasticsearch: @@ -31,6 +40,9 @@ persistent: storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} + certificates: + endpointCertDuration: {{ __service_telemetry_events_certificates_endpoint_cert_duration }} + caCertDuration: {{ __service_telemetry_events_certificates_ca_cert_duration }} metrics: prometheus: enabled: {{ __service_telemetry_metrics_enabled }} @@ -42,7 +54,7 @@ {% endif %} logs: loki: - enabled: {{ 
__service_telemetry_logs_enabled }} + enabled: false replicationFactor: 1 flavor: 1x.extra-small storage: @@ -50,6 +62,12 @@ {% if __service_telemetry_storage_persistent_storage_class is defined %} storageClass: {{ __service_telemetry_storage_persistent_storage_class }} {% endif %} + transports: + qdr: + enabled: true + certificates: + endpointCertDuration: {{ __service_telemetry_transports_certificates_endpoint_cert_duration }} + caCertDuration: {{ __service_telemetry_transports_certificates_ca_cert_duration }} highAvailability: enabled: {{ __service_telemetry_high_availability_enabled }} when: diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index 5216d4acf..3041d22ea 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -12,8 +12,6 @@ sg_core_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-core:{{ sg_core_image_tag }}" sg_bridge_image_path: "{{ __internal_registry_path }}/{{ namespace }}/sg-bridge:{{ sg_bridge_image_tag }}" prometheus_webhook_snmp_image_path: "{{ __internal_registry_path }}/{{ namespace }}/prometheus-webhook-snmp:{{ prometheus_webhook_snmp_image_tag }}" - loki_operator_image_path: "{{ __internal_registry_path }}/{{ namespace }}/loki-operator:{{ loki_operator_image_tag }}" - loki_operator_bundle_image_path: "{{ __internal_registry_path }}/{{ namespace }}/loki-operator-bundle:{{ loki_operator_image_tag }}" - name: Fail on mutually exclusive flags fail: @@ -43,7 +41,7 @@ when: base_dir | length == 0 - name: Get new operator sdk - when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool or __deploy_loki_enabled | bool + when: __local_build_enabled | bool or __deploy_from_bundles_enabled | bool command: "{{ base_dir }}/get_new_operator_sdk.sh {{ new_operator_sdk_version }}" - when: __local_build_enabled | bool @@ -53,61 +51,6 @@ tags: - clone - - block: - - name: Move loki-operator to loki-operator folder - command: rm -rf "{{ base_dir 
}}/working/loki-operator" - command: mv "{{ base_dir }}/working/loki/{{ loki_operator_folder }}" "{{ base_dir }}/working/loki-operator" - - - name: Get new go - command: "{{ base_dir }}/get_go.sh {{ new_go_version }}" - when: __deploy_loki_enabled | bool - - # TLS verification support doesn't seem to be implemented in the operator yet - - block: - - name: Prepare for skip Loki TLS patch - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/internal/config/loki-config.yaml" - regexp: "\ \ \ \ insecure: false\n -\ \ \ http_config:\n -\ \ \ \ \ insecure_skip_verify: true" - replace: "" - - - name: Skip Loki TLS verification - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/internal/config/loki-config.yaml" - regexp: "\ \ \ \ s3forcepathstyle: true" - replace: "\ \ \ \ s3forcepathstyle: true\n -\ \ \ insecure: false\n -\ \ \ http_config:\n -\ \ \ \ \ insecure_skip_verify: true" - when: - - __deploy_loki_enabled | bool - - __loki_skip_tls_verify | bool - - __service_telemetry_observability_strategy == "use_community" - - - name: Remove forced multi-tenancy from loki-operator config - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/internal/config/loki-config.yaml" - regexp: "auth_enabled: true" - replace: "auth_enabled: false" - when: __deploy_loki_enabled | bool - - - block: - - name: Replace loki-operator golang base image - replace: - path: "{{ base_dir }}/working/loki-operator/Dockerfile" - regexp: "FROM golang:1.16 as builder" - replace: "FROM {{ __golang_image_path }} as builder" - - - name: Replace Loki image - replace: - path: "{{ base_dir }}/working/loki-operator/internal/manifests/var.go" - regexp: "docker.io/grafana/loki:\\d\\.\\d\\.\\d" - replace: "{{ __loki_image_path }}" - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" - - name: Create base build list set_fact: build_list: @@ -117,20 +60,6 @@ - { name: sg-bridge, 
dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: ./working/sg-bridge } - { name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: ./working/prometheus-webhook-snmp } - - block: - - name: Create Loki build list - set_fact: - loki_build_list: - - { name: loki-operator-bundle, dockerfile_path: bundle.Dockerfile, image_reference_name: loki_operator_bundle_image_path, working_build_dir: ./working/loki-operator } - - { name: loki-operator, dockerfile_path: Dockerfile, image_reference_name: loki_operator_image_path, working_build_dir: ./working/loki-operator } - - - name: Combine lists when community operators are enabled - set_fact: - build_list: "{{ build_list + loki_build_list }}" - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" - - debug: var: build_list diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 208b130cd..d86093cce 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -8,7 +8,6 @@ loop: - smartgateways.smartgateway.infra.watch - servicetelemetrys.infra.watch - - lokistacks.loki.openshift.io tags: - clean-crds diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 13944668d..9b0c838f9 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -13,6 +13,8 @@ name: certified-operators - disabled: false name: redhat-operators + - disabled: "{{ false if __service_telemetry_observability_strategy == 'use_community' else true }}" + name: community-operators - name: Create OperatorGroup k8s: @@ -63,22 +65,6 @@ source: redhat-operators sourceNamespace: openshift-marketplace -- name: Enable OperatorHub.io for Elastic Cloud on Kubernetes - k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: 
CatalogSource - metadata: - name: operatorhubio-operators - namespace: openshift-marketplace - spec: - sourceType: grpc - image: quay.io/operatorhubio/catalog:latest - displayName: OperatorHub.io Operators - publisher: OperatorHub.io - when: - - __service_telemetry_observability_strategy == "use_community" - - name: Subscribe to Elastic Cloud on Kubernetes Operator k8s: definition: @@ -123,7 +109,7 @@ channel: beta installPlanApproval: Automatic name: prometheus - source: operatorhubio-operators + source: community-operators sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy == "use_community" diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index baf17138e..a7c3c2578 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -81,132 +81,3 @@ - name: Load Service Telemetry Operator CSV shell: oc apply -f working/service-telemetry-operator-bundle/manifests/service-telemetry-operator.clusterserviceversion.yaml -n "{{ namespace }}" -# --- Loki Operator --- -- block: - - name: Prevent Loki Operator from building operator-sdk - replace: - path: "{{ base_dir }}/working/loki-operator/.bingo/Variables.mk" - regexp: '^.*modfile=operator-sdk.mod.*$' - replace: '' - - - name: Prevent Loki Operator from replacing GOBIN - replace: - path: "{{ base_dir }}/working/loki-operator/.bingo/Variables.mk" - regexp: '^GOBIN.*$' - replace: 'GOBIN ?= $(shell go env GOBIN)' - - - name: Prevent Loki Operator from using system golang - replace: - path: "{{ base_dir }}/working/loki-operator/.bingo/Variables.mk" - regexp: '^GO .*$' - replace: 'GO ?= $(GOBIN)"/go"' - - - name: Prevent Loki Operator from putting authentication on /metrics - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/openshift/kustomization.yaml" - regexp: '{{ item }}' - replace: '' - loop: - - "patchesStrategicMerge:" - - "- 
manager_auth_proxy_patch.yaml" - - "- manager_related_image_patch.yaml" - - "- manager_run_flags_patch.yaml" - - "- prometheus_service_monitor_patch.yaml" - - - name: Generate Loki Operator CSV - make: - chdir: "{{ base_dir }}/working/loki-operator" - target: bundle - params: - REGISTRY_ORG: infrawatch - OPERATOR_SDK: "{{ base_dir }}/working/operator-sdk" - GOROOT: "{{ base_dir }}/working/go" - GOTOOLDIR: "{{ base_dir }}/working/go/pkg/tool/linux_amd64" - GOBIN: "{{ base_dir }}/working/go/bin" - - - name: Replace namespace in loki-operator CSV - replace: - path: "{{ base_dir }}/working/loki-operator/bundle/manifests/loki-operator.clusterserviceversion.yaml" - regexp: 'placeholder' - replace: '{{ namespace }}' - - - name: Replace image path in loki-operator CSV - replace: - path: "{{ base_dir }}/working/loki-operator/bundle/manifests/loki-operator.clusterserviceversion.yaml" - regexp: '{{ item }}' - replace: '{{ loki_operator_image_path }}' - loop: - - quay.io/infrawatch/loki-operator:v0.0.1 - - quay.io/openshift-logging/loki-operator:v0.0.1 - - - name: Replace namespace in loki-operator - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: 'default' - replace: '{{ namespace }}' - - - name: Remove additional manager deployment - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: '^.*manager' - replace: '' - - - name: Remove unnecessary patches - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: '.*patch.*' - replace: '' - - - name: Setup PVs for Loki in crc - shell: - cmd: ./create_standard_pvs.sh 4 - when: - - is_crc | bool - - __service_telemetry_logs_enabled | bool - - - name: Replace namespace in S3 secret - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/minio/secret.yaml" - regexp: 'default' - replace: '{{ namespace }}' - when: - - 
__deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" - -- block: - - name: Remove minio deployment - replace: - path: "{{ base_dir }}/working/loki-operator/config/overlays/development/kustomization.yaml" - regexp: '^.*minio' - replace: '' - when: - - not __deploy_minio_enabled | bool or - not __service_telemetry_observability_strategy == "use_community" or - not __service_telemetry_logs_enabled | bool - - __deploy_loki_enabled | bool - -- block: - - name: Deploy Loki Operator - make: - chdir: "{{ base_dir }}/working/loki-operator" - target: deploy - params: - REGISTRY_ORG: infrawatch - OPERATOR_SDK: "{{ base_dir }}/working/operator-sdk" - GOROOT: "{{ base_dir }}/working/go" - GOTOOLDIR: "{{ base_dir }}/working/go/pkg/tool/linux_amd64" - GOBIN: "{{ base_dir }}/working/go/bin" - - - name: Load Loki Operator bundle manifests - command: oc apply -f working/loki-operator/bundle/manifests/{{ item }} -n "{{ namespace }}" - loop: - - loki.grafana.com_lokistacks.yaml - - loki-operator-controller-manager-metrics-service_v1_service.yaml - - loki-operator-manager-config_v1_configmap.yaml - - loki-operator-metrics-reader_rbac.authorization.k8s.io_v1_clusterrole.yaml - - loki-operator-prometheus_rbac.authorization.k8s.io_v1_rolebinding.yaml - - loki-operator-prometheus_rbac.authorization.k8s.io_v1_role.yaml - - loki-operator.clusterserviceversion.yaml - when: - - __deploy_loki_enabled | bool - - __service_telemetry_observability_strategy == "use_community" diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 2427b4eb6..21e9f8652 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -56,8 +56,32 @@ spec: enabled: description: Deploy container to send snmp traps type: boolean + community: + description: 'Target community for SNMP traps. 
Default is "public"' + type: string target: - description: Target address for SNMP traps to send to + description: 'Target address for SNMP traps to send to.' + type: string + retries: + description: 'SNMP trap delivery retry limit. Default is 5' + type: integer + timeout: + description: 'Response timeout, in seconds. Default is 1' + type: integer + port: + description: 'SNMP trap delivery port. Default is 162' + type: integer + alertOidLabel: + description: 'Label for finding the OID. Default is "oid"' + type: string + trapOidPrefix: + description: 'OID prefix for the trap variable bindings. Default is "1.3.6.1.4.1.50495.15"' + type: string + trapDefaultOid: + description: 'The trap OID if none is found in the Prometheus alert labels. Default is "1.3.6.1.4.1.50495.15.1.2.1"' + type: string + trapDefaultSeverity: + description: 'The trap severity if none is found in the Prometheus alert labels. Default is empty.' type: string type: object type: object @@ -175,6 +199,19 @@ spec: type: string type: object type: object + certificates: + properties: + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) of the ElasticSearch endpoint Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + caCertDuration: + description: The requested 'duration' (i.e. lifetime) of the ElasticSearch CA Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object type: object type: object logs: @@ -262,6 +299,19 @@ spec: description: Enable web interface for QDR type: boolean type: object + certificates: + properties: + endpointCertDuration: + description: The requested 'duration' (i.e.
lifetime) of the QDR endpoint Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + caCertDuration: + description: The requested 'duration' (i.e. lifetime) of the QDR CA Certificate. + Minimum accepted duration is 1 hour. Value must be in units accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object type: object type: object graphing: diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 7b5cfb63e..9d324839c 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -10,7 +10,15 @@ spec: receivers: snmpTraps: enabled: false + community: public target: 192.168.24.254 + retries: 5 + port: 162 + timeout: 1 + alertOidLabel: oid + trapOidPrefix: "1.3.6.1.4.1.50495.15" + trapDefaultOid: "1.3.6.1.4.1.50495.15.1.2.1" + trapDefaultSeverity: "" storage: strategy: persistent persistent: @@ -33,6 +41,9 @@ spec: strategy: persistent persistent: pvcStorageRequest: 20Gi + certificates: + endpointCertDuration: 70080h + caCertDuration: 70080h logs: loki: enabled: false @@ -94,6 +105,9 @@ spec: enabled: true web: enabled: false + certificates: + endpointCertDuration: 70080h + caCertDuration: 70080h highAvailability: enabled: false # vim: set ft=yaml shiftwidth=2 tabstop=2 expandtab: diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index eaa25d334..182dbf160 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL 
operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="=v4.10" +LABEL com.redhat.openshift.versions="v4.10-v4.12" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index 8a196cf74..c275c943b 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -46,12 +46,44 @@ spec: properties: snmpTraps: properties: + alertOidLabel: + description: Label for finding the OID. Default is + "oid" + type: string + community: + description: Target community for SNMP traps. Default + is "public" + type: string enabled: description: Deploy container to send snmp traps type: boolean + port: + description: SNMP trap delivery port. Default is + 162 + type: integer + retries: + description: SNMP trap delivery retry limit. Default + is 5 + type: integer target: description: Target address for SNMP traps to send - to + to. + type: string + timeout: + description: Response timeout, in seconds. Default + is 1 + type: integer + trapDefaultOid: + description: The trap OID if none is found in the + Prometheus alert labels. Default is "1.3.6.1.4.1.50495.15.1.2.1" + type: string + trapDefaultSeverity: + description: The trap severity if none is found in + the Prometheus alert labels. Default is empty. + type: string + trapOidPrefix: + description: OID prefix for the trap variable bindings.
+ Default is "1.3.6.1.4.1.50495.15" type: string type: object type: object @@ -92,6 +124,23 @@ spec: elasticsearch: description: Events storage backend ElasticSearch properties: + certificates: + properties: + caCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the ElasticSearch CA Certificate. Minimum accepted + duration is 1 hour. Value must be in units accepted + by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the ElasticSearch endpoint Certificate. Minimum + accepted duration is 1 hour. Value must be in units + accepted by Go time.ParseDuration https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object enabled: description: Enable ElasticSearch as a storage backend for events @@ -454,6 +503,23 @@ spec: qdr: description: QDR configuration for data transport properties: + certificates: + properties: + caCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the QDR CA Certificate. Minimum accepted duration + is 1 hour. Value must be in units accepted by Go time.ParseDuration + https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + endpointCertDuration: + description: The requested 'duration' (i.e. lifetime) + of the QDR endpoint Certificate. Minimum accepted duration + is 1 hour. 
Value must be in units accepted by Go time.ParseDuration + https://golang.org/pkg/time/#ParseDuration + pattern: ^((([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + type: object enabled: description: Enable QDR data transort type: boolean diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index c65b02bd9..a04701500 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -15,8 +15,16 @@ metadata: "alertmanager": { "receivers": { "snmpTraps": { + "alertOidLabel": "oid", + "community": "public", "enabled": false, - "target": "192.168.24.254" + "port": 162, + "retries": 5, + "target": "192.168.24.254", + "timeout": 1, + "trapDefaultOid": "1.3.6.1.4.1.50495.15.1.2.1", + "trapDefaultSeverity": "", + "trapOidPrefix": "1.3.6.1.4.1.50495.15" } }, "storage": { @@ -31,6 +39,10 @@ metadata: "backends": { "events": { "elasticsearch": { + "certificates": { + "caCertDuration": "70080h", + "endpointCertDuration": "70080h" + }, "enabled": false, "storage": { "persistent": { @@ -144,6 +156,10 @@ metadata: "observabilityStrategy": "use_community", "transports": { "qdr": { + "certificates": { + "caCertDuration": "70080h", + "endpointCertDuration": "70080h" + }, "enabled": true, "web": { "enabled": false @@ -161,7 +177,6 @@ metadata: description: Service Telemetry Framework. Umbrella Operator for instantiating the required dependencies and configuration of various components to build a Service Telemetry platform for telco grade monitoring. 
- olm.properties: '[{"type": "olm.maxOpenShiftVersion", "value": "4.10"}]' olm.skipRange: '>=<> <<>' operatorframework.io/suggested-namespace: service-telemetry operators.openshift.io/valid-subscription: '["OpenStack Platform", "Cloud Infrastructure", diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml new file mode 100644 index 000000000..8edfa0da9 --- /dev/null +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -0,0 +1,3 @@ +properties: + - type: olm.maxOpenShiftVersion + value: "4.12" diff --git a/deploy/olm-catalog/service-telemetry-operator/tests/scorecard/config.yaml b/deploy/olm-catalog/service-telemetry-operator/tests/scorecard/config.yaml new file mode 100644 index 000000000..dc0bae379 --- /dev/null +++ b/deploy/olm-catalog/service-telemetry-operator/tests/scorecard/config.yaml @@ -0,0 +1,21 @@ +kind: Configuration +apiversion: scorecard.operatorframework.io/v1alpha3 +metadata: + name: config +stages: +- parallel: true + tests: + - image: quay.io/operator-framework/scorecard-test:latest + entrypoint: + - scorecard-test + - basic-check-spec + labels: + suite: basic + test: basic-check-spec-test + - image: quay.io/operator-framework/scorecard-test:latest + entrypoint: + - scorecard-test + - olm-bundle-validation + labels: + suite: olm + test: olm-bundle-validation-test diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 2ba6d4b43..714d55471 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -24,7 +24,15 @@ servicetelemetry_defaults: receivers: snmp_traps: enabled: false + community: public target: 192.168.24.254 + retries: 5 + timeout: 1 + port: 162 + alert_oid_label: "oid" + trap_oid_prefix: "1.3.6.1.4.1.50495.15" + trap_default_oid: "1.3.6.1.4.1.50495.15.1.2.1" + trap_default_severity: "" backends: metrics: @@ -48,6 +56,9 
@@ servicetelemetry_defaults: persistent: storage_class: "" pvc_storage_request: 20Gi + certificates: + endpoint_cert_duration: 70080h + ca_cert_duration: 70080h logs: loki: enabled: false @@ -73,6 +84,9 @@ servicetelemetry_defaults: deployment_size: 1 web: enabled: false + certificates: + endpoint_cert_duration: 70080h + ca_cert_duration: 70080h graphing: enabled: false diff --git a/roles/servicetelemetry/tasks/_local_signing_authority.yml b/roles/servicetelemetry/tasks/_local_signing_authority.yml index f4ae3fd62..559f0ebf3 100644 --- a/roles/servicetelemetry/tasks/_local_signing_authority.yml +++ b/roles/servicetelemetry/tasks/_local_signing_authority.yml @@ -20,7 +20,7 @@ name: '{{ ansible_operator_meta.namespace }}-ca' namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.backends.events.elasticsearch.certificates.ca_cert_duration }}' secretName: '{{ ansible_operator_meta.namespace }}-ca' commonName: '{{ ansible_operator_meta.namespace }}-ca' isCA: true @@ -50,7 +50,7 @@ name: elasticsearch-es-http namespace: '{{ ansible_operator_meta.namespace }}' spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.backends.events.elasticsearch.certificates.endpoint_cert_duration }}' commonName: elasticsearch-es-http secretName: 'elasticsearch-es-cert' dnsNames: diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index cf7cc937b..de010cafc 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -26,7 +26,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-selfsigned" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.ca_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace 
}}.svc.cluster.local" isCA: true issuerRef: @@ -58,7 +58,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.ca_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-openstack-ca" isCA: true issuerRef: @@ -75,7 +75,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-openstack-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.endpoint_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" @@ -107,7 +107,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.ca_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect-inter-router-ca" isCA: true issuerRef: @@ -124,7 +124,7 @@ name: "{{ ansible_operator_meta.name }}-interconnect-inter-router-credentials" namespace: "{{ ansible_operator_meta.namespace }}" spec: - duration: '{{ certificate_duration }}' + duration: '{{ servicetelemetry_vars.transports.qdr.certificates.endpoint_cert_duration }}' commonName: "{{ ansible_operator_meta.name }}-interconnect" dnsNames: - "{{ ansible_operator_meta.name }}-interconnect.{{ ansible_operator_meta.namespace }}.svc.cluster.local" diff --git a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 index 64292a4d2..b4a48445b 100644 --- a/roles/servicetelemetry/templates/manifest_snmp_traps.j2 +++ b/roles/servicetelemetry/templates/manifest_snmp_traps.j2 @@ 
-20,10 +20,20 @@ spec: - containerPort: 9099 env: - name: SNMP_COMMUNITY - value: public + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.community }}" - name: SNMP_RETRIES - value: "1" + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.retries }}" - name: SNMP_HOST value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.target }}" - name: SNMP_PORT - value: "162" + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" + - name: SNMP_TIMEOUT + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.port }}" + - name: ALERT_OID_LABEL + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.alert_oid_label }}" + - name: TRAP_OID_PREFIX + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.trap_oid_prefix }}" + - name: TRAP_DEFAULT_OID + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.trap_default_oid }}" + - name: TRAP_DEFAULT_SEVERITY + value: "{{ servicetelemetry_vars.alerting.alertmanager.receivers.snmp_traps.trap_default_severity }}" diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index aa82145cf..fbd84c1af 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -17,6 +17,15 @@ NUMCLOUDS=${NUMCLOUDS:-1} CLOUDNAMES=() OCP_PROJECT=${OCP_PROJECT:-} +OC_CLIENT_VERSION_X=$(oc version --client | grep Client | cut -f2 -d: | tr -s -d "[:space:]" - | cut -d. -f1) +OC_CLIENT_VERSION_X_REQUIRED=4 +OC_CLIENT_VERSION_Y=$(oc version --client | grep Client | cut -f2 -d: | tr -s -d "[:space:]" - | cut -d. 
-f2) +OC_CLIENT_VERSION_Y_REQUIRED=10 + +if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC_CLIENT_VERSION_X}" != "${OC_CLIENT_VERSION_X_REQUIRED}" ]; then + echo "*** Please install 'oc' client version ${OC_CLIENT_VERSION_X_REQUIRED}.${OC_CLIENT_VERSION_Y_REQUIRED} or later ***" + exit 1 +fi CLEANUP=${CLEANUP:-true} @@ -59,7 +68,7 @@ for NAME in "${CLOUDNAMES[@]}"; do done echo "*** [INFO] Triggering an alertmanager notification..." -PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-k8s) +PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"labels\":{\"alertname\":\"Testalert1\"}}]' https://default-alertmanager-proxy:9095/api/v1/alerts" # it takes some time to get the alert delivered, continuing with other tests @@ -146,7 +155,7 @@ oc logs "$(oc get pod -l app=default-snmp-webhook -o jsonpath='{.items[0].metada echo echo "*** [INFO] Logs from alertmanager..." -oc logs "$(oc get pod -l app=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager +oc logs "$(oc get pod -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].metadata.name}')" -c alertmanager echo echo "*** [INFO] Cleanup resources..." 
diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index dcf055faa..50735b6a5 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -21,9 +21,9 @@ spec: - name: CLOUDNAME value: <> - name: ELASTICSEARCH_AUTH_PASS - value: <> + value: "<>" - name: PROMETHEUS_AUTH_PASS - value: <> + value: "<>" volumeMounts: - name: collectd-config mountPath: /etc/minimal-collectd.conf.template @@ -48,9 +48,9 @@ spec: - name: CLOUDNAME value: <> - name: ELASTICSEARCH_AUTH_PASS - value: <> + value: "<>" - name: PROMETHEUS_AUTH_PASS - value: <> + value: "<>" volumeMounts: - name: ceilometer-publisher mountPath: /ceilometer_publish.py From 59f04423cc9883e1e8735674b2153f2b55c7ed14 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 8 Mar 2023 12:51:40 -0500 Subject: [PATCH 09/28] Fix default CA cert lifetime values in stf-run-ci (#414) (cherry picked from commit e3a1125f8af59c4028e63f2080fea04eca9f5590) --- build/stf-run-ci/defaults/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index 9b54f0ce3..ab9dab7a6 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -10,7 +10,7 @@ __deploy_from_bundles_enabled: false __deploy_stf: true __service_telemetry_events_certificates_endpoint_cert_duration: 70080h -__service_telemetry_events_certificates_ca_cert_duration: 2160h +__service_telemetry_events_certificates_ca_cert_duration: 70080h __service_telemetry_events_enabled: true __service_telemetry_high_availability_enabled: false __service_telemetry_metrics_enabled: true @@ -28,7 +28,7 @@ __service_telemetry_snmptraps_trap_default_severity: "" __service_telemetry_logs_enabled: false __service_telemetry_observability_strategy: use_community __service_telemetry_transports_certificates_endpoint_cert_duration: 70080h 
-__service_telemetry_transports_certificates_ca_cert_duration: 2160h +__service_telemetry_transports_certificates_ca_cert_duration: 70080h __internal_registry_path: image-registry.openshift-image-registry.svc:5000 __service_telemetry_bundle_image_path: __smart_gateway_bundle_image_path: From 2085c152bcb4b78899e40f592d18be2b1d747bd9 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Wed, 8 Mar 2023 16:51:44 -0500 Subject: [PATCH 10/28] fix/client version mismatch (#415) * Fix PROMETHEUS_K8S_TOKEN to account for oc version mismatch * Fix to use correct variable name in test (cherry picked from commit 7c66ed859958abd77513f54f9bc47b2fa9d03271) --- tests/smoketest/smoketest.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index fbd84c1af..8a801c004 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -68,12 +68,18 @@ for NAME in "${CLOUDNAMES[@]}"; do done echo "*** [INFO] Triggering an alertmanager notification..." 
-PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) + +# check if the oc client version is less than 4.11 and adjust the token command to match available commands +if [ 0${OC_CLIENT_VERSION_Y} -lt 011 ]; then + PROMETHEUS_K8S_TOKEN=$(oc serviceaccounts get-token prometheus-k8s) +else + PROMETHEUS_K8S_TOKEN=$(oc create token prometheus-k8s) +fi + oc run curl --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"labels\":{\"alertname\":\"Testalert1\"}}]' https://default-alertmanager-proxy:9095/api/v1/alerts" # it takes some time to get the alert delivered, continuing with other tests - # Trying to find a less brittle test than a timeout JOB_TIMEOUT=300s for NAME in "${CLOUDNAMES[@]}"; do From c1fc1dbd5e9504427f49ff408fe9d3c29cd874a2 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 9 Mar 2023 09:30:38 -0500 Subject: [PATCH 11/28] Allow oc client version override for Jenkins agent (#416) (#417) (cherry picked from commit 4d2f34842eac2fbad71c2fb0d4f8fb2818b12d68) --- .jenkins/agent/Dockerfile | 6 +++++- .jenkins/agent/README.md | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/.jenkins/agent/Dockerfile b/.jenkins/agent/Dockerfile index fe1a7fa70..c41fb9c69 100644 --- a/.jenkins/agent/Dockerfile +++ b/.jenkins/agent/Dockerfile @@ -1,4 +1,8 @@ FROM quay.io/openshift/origin-jenkins-agent-base:latest + +# pass --build-arg OC_CLIENT_VERSION= to build stage to change client version +ARG OC_CLIENT_VERSION="4.12" + RUN curl -LO "https://github.com/operator-framework/operator-sdk/releases/download/v0.19.4/operator-sdk-v0.19.4-x86_64-linux-gnu" && \ chmod +x operator-sdk-v0.19.4-x86_64-linux-gnu && mv operator-sdk-v0.19.4-x86_64-linux-gnu /usr/local/bin/operator-sdk RUN dnf install -y ansible golang python38 && \ @@ -6,5 +10,5 @@ RUN dnf install -y ansible golang python38 && \ alternatives --set python 
/usr/bin/python3.8 && \ python -m pip install openshift kubernetes "ansible-core~=2.12" && \ ansible-galaxy collection install -f 'kubernetes.core:>=2.2.0' community.general -RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-4.11/openshift-client-linux.tar.gz" && \ +RUN curl -LO "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest-$OC_CLIENT_VERSION/openshift-client-linux.tar.gz" && \ tar -xv -C /usr/local/bin -f openshift-client-linux.tar.gz diff --git a/.jenkins/agent/README.md b/.jenkins/agent/README.md index 5eaf16527..dbef269fc 100644 --- a/.jenkins/agent/README.md +++ b/.jenkins/agent/README.md @@ -1,13 +1,29 @@ The Jenkins agent pod is used to run all Jenkins pipelines for the Service Telemetry Framework. # Build in OpenShift + ```bash oc new-build --binary=true --name=jenkins-agent oc start-build jenkins-agent --from-dir . ``` + +You can override the default `oc` client version being installed by overriding the default argument `OC_CLIENT_VERSION` from the `Dockerfile`. + +```bash +oc new-build --build-arg OC_CLIENT_VERSION=4.10 --binary=true --name=jenkins-agent +oc start-build jenkins-agent --from-dir . +``` + Builds will be available in-cluster at the address: `image-registry.openshift-image-registry.svc:5000//jenkins-agent:latest` # Build with Podman/Docker + ```bash podman build -t jenkins-agent:latest . ``` + +You can override the default `oc` client version being installed by overriding the default argument `OC_CLIENT_VERSION` from the `Dockerfile`. + +```bash +podman build --build-arg OC_CLIENT_VERSION=4.10 -t jenkins-agent:latest . 
+``` From 8693f4632e8976d33f8db24dcef32f07b4b5bb68 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 26 Oct 2023 10:03:53 -0400 Subject: [PATCH 12/28] Use stable-1.5 channel in the stable-1.5 branch --- .github/workflows/main.yml | 2 +- build/stf-run-ci/tasks/main.yml | 2 +- build/stf-run-ci/tasks/setup_stf.yml | 6 +++--- build/update_csv.sh | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2b04f59a5..f7269fea9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -58,7 +58,7 @@ jobs: run: sudo mv operator-sdk /usr/local/bin - name: Generate bundle locally - run: operator-sdk generate bundle --manifests --metadata --default-channel unstable --channels unstable + run: operator-sdk generate bundle --manifests --metadata --default-channel stable-1.5 --channels stable-1.5 - name: Check if bundle generation results in local changes run: git diff --exit-code diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index bd0821959..cf2b0a880 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -187,7 +187,7 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: service-telemetry-operator source: service-telemetry-framework-operators diff --git a/build/stf-run-ci/tasks/setup_stf.yml b/build/stf-run-ci/tasks/setup_stf.yml index e76eb1734..ce4713931 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -9,7 +9,7 @@ namespace: openshift-marketplace spec: displayName: InfraWatch Operators - image: quay.io/infrawatch-operators/infrawatch-catalog:unstable + image: quay.io/infrawatch-operators/infrawatch-catalog:stable-1.5 publisher: InfraWatch sourceType: grpc updateStrategy: @@ -26,7 +26,7 @@ name: smart-gateway-operator namespace: "{{ namespace }}" spec: - channel: unstable + 
channel: stable-1.5 installPlanApproval: Automatic name: smart-gateway-operator source: infrawatch-operators @@ -42,7 +42,7 @@ name: service-telemetry-operator namespace: "{{ namespace }}" spec: - channel: unstable + channel: stable-1.5 installPlanApproval: Automatic name: service-telemetry-operator source: infrawatch-operators diff --git a/build/update_csv.sh b/build/update_csv.sh index 4e15f1fa3..172653dd4 100755 --- a/build/update_csv.sh +++ b/build/update_csv.sh @@ -3,4 +3,4 @@ # Run this script from the root directory to update the CSV whenever changes # are made to /deploy/crds/. Changes are written to # /deploy/olm-manifests/service-telemetry-operator/. -operator-sdk generate bundle --channels unstable --default-channel unstable +operator-sdk generate bundle --channels stable-1.5 --default-channel stable-1.5 From e50dc5fdd023c3fe37d71196d7abdaadf923ea90 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Fri, 27 Oct 2023 09:46:34 -0400 Subject: [PATCH 13/28] Drop .zuul.yaml for the stable-1.5 branch Drop .zuul.yaml for stable-1.5 since it's not setup for non-main testing at this point. In the future we may develop a separate set of tests for the stable-1.5 branch during merge, but not for this initial import. We'll rely on Jenkins testing for our functional validations. --- .zuul.yaml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 .zuul.yaml diff --git a/.zuul.yaml b/.zuul.yaml deleted file mode 100644 index 91d848359..000000000 --- a/.zuul.yaml +++ /dev/null @@ -1,50 +0,0 @@ ---- -- job: - name: stf-base - # defined in: https://review.rdoproject.org/cgit/config/tree/zuul.d/_jobs-crc.yaml - parent: base-simple-crc - abstract: true - description: | - Run the stf-run-ci role, and then test stf - roles: # adds in dependent roles i.e. 
put it in the role path - - zuul: github.com/openstack-k8s-operators/ci-framework - # These are the additional repos that zuul will clone - required-projects: - - name: openstack-k8s-operators/ci-framework - override-checkout: main - pre-run: - - ci/prepare.yml - run: - - ci/deploy_stf.yml - - ci/test_stf.yml - post-run: - - ci/post-collect_logs.yml - nodeset: centos-9-crc-xxl - # The default (~30 minutes) is not enough to run through all the job stages - timeout: 3600 - vars: - # Pass vars to crc cli https://review.rdoproject.org/cgit/config/tree/playbooks/crc/simple-start.yaml#n30 - crc_parameters: '--memory 16000 --disk-size 80 --cpus 6' # Increase from 14336 - -- job: - name: stf-crc-latest-nightly_bundles - parent: stf-base - description: - Deploy STF nightly bundles - vars: - scenario: "nightly_bundles" - -- job: - name: stf-crc-latest-local_build - parent: stf-base - description: | - Build images locally and deploy STF - vars: - scenario: "local_build" - -- project: - name: infrawatch/service-telemetry-operator - github-check: - jobs: - - stf-crc-latest-nightly_bundles - - stf-crc-latest-local_build From b063b92a9cae22c9766b9cd22e9bed11de029634 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 6 Nov 2023 09:46:52 -0500 Subject: [PATCH 14/28] Fix qdr auth one_time_upgrade label check (#518) (#521) * Fix qdr auth one_time_upgrade label check * Fix incorrect variable naming on one_time_upgrade label check * Adjust QDR authentication password generation (#520) Adjust the passwords being generated for QDR authentication since certain characters (such as colon) will cause a failure in the parsing routine within qpid-dispatch. Updates the lookup function to only use ascii_letters and digits and increases the length to 32 characters. 
--------- Co-authored-by: Leif Madsen --- roles/servicetelemetry/tasks/component_qdr.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 26c210cfa..885bc3356 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -157,9 +157,9 @@ block: - name: Get QDR BasicAuth secret k8s_info: - api_version: interconnectedcloud.github.io/v1alpha1 - kind: Interconnect - name: "{{ ansible_operator_meta.name }}-interconnect" + api_version: v1 + kind: Secret + name: "{{ ansible_operator_meta.name }}-interconnect-users" namespace: "{{ ansible_operator_meta.namespace }}" register: _qdr_basicauth_object @@ -175,9 +175,9 @@ labels: stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" stringData: - guest: "{{ lookup('password', '/dev/null') }}" + guest: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" when: - - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object[0].metadata.labels.stf_one_time_upgrade is not defined + - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object.resources[0].metadata.labels.stf_one_time_upgrade is not defined - name: Set default Interconnect manifest set_fact: From c8966879c0905ff5eac92304aec86303f8cc8c3e Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Wed, 8 Nov 2023 12:40:16 -0500 Subject: [PATCH 15/28] QDR Auth in smoketest (#525) (#527) * QDR Auth in smoketest * Added qdr-test as a mock of the OSP-side QDR * Connection from qdr-test -> default-interconnect is TLS+Auth * Collectors point at qdr-test instead of default-interconnect directly * Much more realistic than the existing setup * Eliminated a substitution in sensubility config * Used default QDR basic auth in Jenkinsfile (cherry picked from commit 37b6f035d6ff44a39598aacf812b3b893bafda7e) --- Jenkinsfile | 1 - 
tests/smoketest/collectd-sensubility.conf | 2 +- .../smoketest/minimal-collectd.conf.template | 4 +- tests/smoketest/qdr-test.conf.yaml.template | 66 +++++++++++++++++++ tests/smoketest/qdr-test.yaml | 52 +++++++++++++++ tests/smoketest/smoketest.sh | 20 +++--- .../smoketest_ceilometer_entrypoint.sh | 6 +- 7 files changed, 133 insertions(+), 18 deletions(-) create mode 100644 tests/smoketest/qdr-test.conf.yaml.template create mode 100644 tests/smoketest/qdr-test.yaml diff --git a/Jenkinsfile b/Jenkinsfile index f3a13d571..f94b64b1e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -36,7 +36,6 @@ spec: strategy: ephemeral transports: qdr: - auth: none enabled: true deploymentSize: 1 web: diff --git a/tests/smoketest/collectd-sensubility.conf b/tests/smoketest/collectd-sensubility.conf index 0cc773f21..4604e2e85 100644 --- a/tests/smoketest/collectd-sensubility.conf +++ b/tests/smoketest/collectd-sensubility.conf @@ -10,7 +10,7 @@ worker_count=2 checks={"check-container-health":{"command":"cat /healthcheck.log","handlers":[],"interval":3,"occurrences":3,"refresh":90,"standalone":true}} [amqp1] -connection=amqp://default-interconnect.<>.svc:5671 +connection=amqp://qdr-test:5672 results_channel=sensubility/cloud1-telemetry client_name=smoketest.redhat.com results_format=smartgateway diff --git a/tests/smoketest/minimal-collectd.conf.template b/tests/smoketest/minimal-collectd.conf.template index e6cf09189..ac0a6475a 100644 --- a/tests/smoketest/minimal-collectd.conf.template +++ b/tests/smoketest/minimal-collectd.conf.template @@ -11,8 +11,8 @@ LoadPlugin cpu LoadPlugin amqp1 - Host "default-interconnect" - Port "5671" + Host "qdr-test" + Port "5672" Address "collectd" Format JSON diff --git a/tests/smoketest/qdr-test.conf.yaml.template b/tests/smoketest/qdr-test.conf.yaml.template new file mode 100644 index 000000000..24b758214 --- /dev/null +++ b/tests/smoketest/qdr-test.conf.yaml.template @@ -0,0 +1,66 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: 
qdr-test-config +data: + qdrouterd.conf: | + router { + mode: edge + id: qdr-test.smoketest + workerThreads: 2 + saslConfigDir: /etc/sasl2 + saslConfigName: qdrouterd + } + + sslProfile { + name: sslProfile + caCertFile: /etc/pki/tls/certs/ca.crt + } + + listener { + host: 0.0.0.0 + port: 5672 + authenticatePeer: false + saslMechanisms: ANONYMOUS + } + + connector { + host: default-interconnect + port: 5671 + role: edge + saslPassword: pass:<> + saslUsername: guest@default-interconnect + sslProfile: sslProfile + verifyHostname: false + } + + address { + prefix: unicast + distribution: closest + } + + address { + prefix: exclusive + distribution: closest + } + + address { + prefix: broadcast + distribution: multicast + } + + address { + distribution: multicast + prefix: collectd + } + + address { + distribution: multicast + prefix: anycast/ceilometer + } + + log { + module: DEFAULT + enable: info+ + includeTimestamp: true + } diff --git a/tests/smoketest/qdr-test.yaml b/tests/smoketest/qdr-test.yaml new file mode 100644 index 000000000..3e6366cc2 --- /dev/null +++ b/tests/smoketest/qdr-test.yaml @@ -0,0 +1,52 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + openshift.io/scc: restricted-v2 + name: qdr-test + labels: + qdr: qdr-test +spec: + containers: + - name: qdr + image: quay.io/tripleowallabycentos9/openstack-qdrouterd:current-tripleo + imagePullPolicy: IfNotPresent + command: ['/usr/sbin/qdrouterd','-c','/etc/qpid-dispatch/qdrouterd.conf'] + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + ports: + - containerPort: 5672 + name: amqp + protocol: TCP + volumeMounts: + - mountPath: /etc/pki/tls/certs/ + name: default-interconnect-selfsigned-cert + - mountPath: /etc/qpid-dispatch/ + name: qdr-test-config + resources: {} + volumes: + - name: default-interconnect-selfsigned-cert + secret: + defaultMode: 420 + secretName: default-interconnect-selfsigned + - name: qdr-test-config + configMap: + defaultMode: 420 + name: 
qdr-test-config + +--- + +apiVersion: v1 +kind: Service +metadata: + name: qdr-test +spec: + ports: + - name: amqp + port: 5672 + targetPort: amqp + selector: + qdr: qdr-test diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 2909e694f..29510a837 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -27,13 +27,6 @@ if [ "${OC_CLIENT_VERSION_Y}" -lt "${OC_CLIENT_VERSION_Y_REQUIRED}" ] || [ "${OC exit 1 fi -if [ "$(oc get stf default -o=jsonpath='{.spec.transports.qdr.auth}')" != "none" ]; then - echo "*** QDR authentication is currently not supported in smoketests." - echo "To disable it, use: oc patch stf default --patch '{\"spec\":{\"transports\":{\"qdr\":{\"auth\":\"none\"}}}}' --type=merge" - echo "For more info: https://github.com/infrawatch/service-telemetry-operator/pull/492" - exit 1 -fi - CLEANUP=${CLEANUP:-true} SMOKETEST_VERBOSE=${SMOKETEST_VERBOSE:-true} @@ -57,18 +50,23 @@ ELASTICSEARCH_AUTH_PASS=$(oc get secret elasticsearch-es-elastic-user -ogo-templ echo "*** [INFO] Getting Prometheus authentication password" PROMETHEUS_AUTH_PASS=$(oc get secret default-prometheus-htpasswd -ogo-template='{{ .data.password | base64decode }}') -echo "*** [INFO] Setting namepsace for collectd-sensubility config" -sed "s/<>/${OCP_PROJECT}/g" "${REL}/collectd-sensubility.conf" > /tmp/collectd-sensubility.conf - echo "*** [INFO] Creating configmaps..." 
oc delete configmap/stf-smoketest-healthcheck-log configmap/stf-smoketest-collectd-config configmap/stf-smoketest-sensubility-config configmap/stf-smoketest-collectd-entrypoint-script configmap/stf-smoketest-ceilometer-publisher configmap/stf-smoketest-ceilometer-entrypoint-script job/stf-smoketest || true oc create configmap stf-smoketest-healthcheck-log --from-file "${REL}/healthcheck.log" oc create configmap stf-smoketest-collectd-config --from-file "${REL}/minimal-collectd.conf.template" -oc create configmap stf-smoketest-sensubility-config --from-file /tmp/collectd-sensubility.conf +oc create configmap stf-smoketest-sensubility-config --from-file "${REL}/collectd-sensubility.conf" oc create configmap stf-smoketest-collectd-entrypoint-script --from-file "${REL}/smoketest_collectd_entrypoint.sh" oc create configmap stf-smoketest-ceilometer-publisher --from-file "${REL}/ceilometer_publish.py" oc create configmap stf-smoketest-ceilometer-entrypoint-script --from-file "${REL}/smoketest_ceilometer_entrypoint.sh" +echo "*** [INFO] Creating Mock OSP Metrics QDR router..." +oc delete pod qdr-test +oc delete service qdr-test +oc delete configmap qdr-test-config +AMQP_PASS=$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d) +oc create -f <(sed -e "s/<>/${AMQP_PASS}/;" "${REL}/qdr-test.conf.yaml.template") +oc create -f "${REL}/qdr-test.yaml" + echo "*** [INFO] Creating smoketest jobs..." 
oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index 8e2ac7f6f..adf3a9046 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -13,11 +13,11 @@ POD=$(hostname) echo "*** [INFO] My pod is: ${POD}" # Run ceilometer_publisher script -python3 /ceilometer_publish.py default-interconnect:5671 'driver=amqp&topic=cloud1-metering' 'driver=amqp&topic=cloud1-event' +python3 /ceilometer_publish.py qdr-test:5672 'driver=amqp&topic=cloud1-metering' 'driver=amqp&topic=cloud1-event' # Sleeping to produce data -echo "*** [INFO] Sleeping for 20 seconds to produce all metrics and events" -sleep 20 +echo "*** [INFO] Sleeping for 30 seconds to produce all metrics and events" +sleep 30 echo "*** [INFO] List of metric names for debugging..." curl -sk -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names From 94e343c938d60f55b81b85a3590e23e79284badb Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 13 Nov 2023 15:34:18 -0500 Subject: [PATCH 16/28] QDR Auth for infrared 17.1 script (#517) (#528) * QDR Auth for infrared 17.1 script * Fix missing substitution for AMQP_PASS in infrared script (cherry picked from commit d12aa38b1950bd5158ec5f9ba4f95ca6c24500c0) --- tests/infrared/17.1/README.md | 1 + tests/infrared/17.1/infrared-openstack.sh | 12 ++++++++---- tests/infrared/17.1/stf-connectors.yaml.template | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md index 15bcf37a9..0db5c0734 100644 --- a/tests/infrared/17.1/README.md +++ b/tests/infrared/17.1/README.md @@ -6,6 +6,7 @@ OCP_ROUTE_IP="10.0.100.50" \ CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ AMQP_HOST="$(oc get 
route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ +AMQP_PASS="$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d)" \ ENABLE_STF_CONNECTORS=true \ ENABLE_GNOCCHI_CONNECTORS=false \ CONTROLLER_MEMORY="24000" \ diff --git a/tests/infrared/17.1/infrared-openstack.sh b/tests/infrared/17.1/infrared-openstack.sh index 9743a0081..a55de8894 100755 --- a/tests/infrared/17.1/infrared-openstack.sh +++ b/tests/infrared/17.1/infrared-openstack.sh @@ -1,12 +1,11 @@ #!/usr/bin/env bash set -e -# Usage: -# VIRTHOST=my.big.hypervisor.net -# ./infrared-openstack.sh +# Usage: See README.md VIRTHOST=${VIRTHOST:-localhost} AMQP_HOST=${AMQP_HOST:-stf-default-interconnect-5671-service-telemetry.apps-crc.testing} AMQP_PORT=${AMQP_PORT:-443} +AMQP_PASS=${AMQP_PASS:-} SSH_KEY="${SSH_KEY:-${HOME}/.ssh/id_rsa}" NTP_SERVER="${NTP_SERVER:-clock.redhat.com,10.5.27.10,10.11.160.238}" CLOUD_NAME="${CLOUD_NAME:-cloud1}" @@ -97,7 +96,7 @@ ir_create_undercloud() { } stf_create_config() { - sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml + sed -r "s/<>/${AMQP_HOST}/;s/<>/${AMQP_PORT}/;s/<>/${AMQP_PASS}/;s/<>/${CLOUD_NAME}/;s%<>%${CA_CERT_FILE_CONTENT//$'\n'/<@@@>}%;s/<@@@>/\n /g" ${STF_ENVIRONMENT_TEMPLATE} > outputs/stf-connectors.yaml } gnocchi_create_config() { @@ -167,6 +166,11 @@ if [ -z "${CA_CERT_FILE_CONTENT}" ]; then exit 1 fi +if [ -z "${AMQP_PASS}" ]; then + echo "AMQP_PASS must be set and passed to the deployment, or QDR will fail to connect." 
+ exit 1 +fi + time if ${TEMPEST_ONLY}; then echo "-- Running tempest tests" ir_run_tempest diff --git a/tests/infrared/17.1/stf-connectors.yaml.template b/tests/infrared/17.1/stf-connectors.yaml.template index 1031e097b..1dfa26827 100644 --- a/tests/infrared/17.1/stf-connectors.yaml.template +++ b/tests/infrared/17.1/stf-connectors.yaml.template @@ -16,6 +16,8 @@ custom_templates: role: edge verifyHostname: false sslProfile: sslProfile + saslUsername: guest@default-interconnect + saslPassword: pass:<> MetricsQdrSSLProfiles: - name: sslProfile From edebfb5b9be7d9d322fce4d6af725af41a788cf7 Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Mon, 13 Nov 2023 20:50:45 -0500 Subject: [PATCH 17/28] Restart QDR after changing the password (#530) (#534) * Restart QDR after changing the password * Fixes bug reported here: https://github.com/infrawatch/service-telemetry-operator/pull/517#issuecomment-1794919985 * Avoids an extra manual step when changing password * Would affect users who upgrade from earlier STF and subsequently enable basic auth * Also users who need to change their passwords * Fixing ansible lint * Update roles/servicetelemetry/tasks/component_qdr.yml * Adjust QDR restarts to account for HA * [smoketest] Wait for qdr-test to be Running * [smoketest] Wait for QDR password upgrade * Remove zuul QDR auth override (cherry picked from commit 16b8197ed3d0413f652c73a8e309f88f46d635ac) --- ci/vars-zuul-common.yml | 1 - .../servicetelemetry/tasks/component_qdr.yml | 48 +++++++++++++------ tests/smoketest/smoketest.sh | 10 +++- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index dfd64e7ad..39d43a29d 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -2,6 +2,5 @@ namespace: "service-telemetry" setup_bundle_registry_tls_ca: false setup_bundle_registry_auth: false -__service_telemetry_transports_qdr_auth: none base_dir: "{{ sto_dir }}/build" logfile_dir: "{{ ansible_user_dir 
}}/zuul-output/logs/controller" diff --git a/roles/servicetelemetry/tasks/component_qdr.yml b/roles/servicetelemetry/tasks/component_qdr.yml index 885bc3356..7e26e567f 100644 --- a/roles/servicetelemetry/tasks/component_qdr.yml +++ b/roles/servicetelemetry/tasks/component_qdr.yml @@ -163,21 +163,41 @@ namespace: "{{ ansible_operator_meta.namespace }}" register: _qdr_basicauth_object - # Because https://github.com/interconnectedcloud/qdr-operator/blob/576d2b33dac71437ea2b165caaaf6413220767fe/pkg/controller/interconnect/interconnect_controller.go#L634 - - name: Perform a one-time upgrade to the default generated password for QDR BasicAuth - k8s: - definition: - kind: Secret - apiVersion: v1 - metadata: - name: "{{ ansible_operator_meta.name }}-interconnect-users" + - when: + - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object.resources[0].metadata.labels.stf_one_time_upgrade is not defined + block: + # Because https://github.com/interconnectedcloud/qdr-operator/blob/576d2b33dac71437ea2b165caaaf6413220767fe/pkg/controller/interconnect/interconnect_controller.go#L634 + - name: Perform a one-time upgrade to the default generated password for QDR BasicAuth + k8s: + definition: + kind: Secret + apiVersion: v1 + metadata: + name: "{{ ansible_operator_meta.name }}-interconnect-users" + namespace: "{{ ansible_operator_meta.namespace }}" + labels: + stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" + stringData: + guest: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" + + # label_selectors on the k8s object need kubernetes.core>=2.2.0 + - name: Get the list of QDR pods + k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ ansible_operator_meta.namespace }}" + label_selectors: + - application={{ ansible_operator_meta.name }}-interconnect + register: _qdr_pod + + - name: Restart QDR pods to pick up new password + k8s: + state: absent + api_version: v1 + kind: Pod namespace: "{{ ansible_operator_meta.namespace }}" - 
labels: - stf_one_time_upgrade: "{{ lookup('pipe', 'date +%s') }}" - stringData: - guest: "{{ lookup('password', '/dev/null chars=ascii_letters,digits length=32') }}" - when: - - _qdr_basicauth_object.resources[0] is defined and _qdr_basicauth_object.resources[0].metadata.labels.stf_one_time_upgrade is not defined + name: "{{ item.metadata.name }}" + loop: "{{ _qdr_pod.resources }}" - name: Set default Interconnect manifest set_fact: diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 29510a837..caaeb4e88 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -59,14 +59,20 @@ oc create configmap stf-smoketest-collectd-entrypoint-script --from-file "${REL} oc create configmap stf-smoketest-ceilometer-publisher --from-file "${REL}/ceilometer_publish.py" oc create configmap stf-smoketest-ceilometer-entrypoint-script --from-file "${REL}/smoketest_ceilometer_entrypoint.sh" -echo "*** [INFO] Creating Mock OSP Metrics QDR router..." +echo "*** [INFO] Waiting for QDR password upgrade" +AMQP_PASS='' +while [ ${#AMQP_PASS} -lt 32 ]; do AMQP_PASS=$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d); sleep 3; done + +echo "*** [INFO] Creating Mock OSP Metrics QDR..." oc delete pod qdr-test oc delete service qdr-test oc delete configmap qdr-test-config -AMQP_PASS=$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d) oc create -f <(sed -e "s/<>/${AMQP_PASS}/;" "${REL}/qdr-test.conf.yaml.template") oc create -f "${REL}/qdr-test.yaml" +echo -e "\n* [INFO] Waiting for OSP Metrics QDR pod to be Running\n" +oc wait --for=jsonpath='{.status.phase}'=Running pod/qdr-test + echo "*** [INFO] Creating smoketest jobs..." 
oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do From 03269cd5827e35c2ab83443d1ee6501ef20266f3 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Thu, 16 Nov 2023 13:37:52 -0500 Subject: [PATCH 18/28] Support OCP v4.12 through v4.14 (#535) (#536) Support STF 1.5.3 starting at OpenShift version 4.12 due to incompatibility with 4.11 due to dependency requirements. Our primary target is support of OCP EUS releases. Closes: STF-1632 (cherry picked from cba3874b69a20e72ce8939ac49c9616c16711392) --- deploy/olm-catalog/service-telemetry-operator/Dockerfile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in index cbe2ccbf3..871edc3c1 100644 --- a/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in +++ b/deploy/olm-catalog/service-telemetry-operator/Dockerfile.in @@ -13,7 +13,7 @@ LABEL operators.operatorframework.io.metrics.mediatype.v1=metrics+v1 LABEL operators.operatorframework.io.metrics.builder=operator-sdk-v0.19.4 LABEL operators.operatorframework.io.metrics.project_layout=ansible LABEL com.redhat.delivery.operator.bundle=true -LABEL com.redhat.openshift.versions="v4.11-v4.14" +LABEL com.redhat.openshift.versions="v4.12-v4.14" LABEL com.redhat.delivery.backport=false LABEL com.redhat.component="service-telemetry-operator-bundle-container" \ From b321394efb5c3d04b21ece843e5e0f35a8a06744 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 27 Nov 2023 13:26:45 -0500 Subject: [PATCH 19/28] Adjust Operator dependency version requirements (#538) (#543) * Adjust Operator dependency version requirements (#538) Adjust the operator package dependency requirements to align to known required versions. Primarily reduce the version of openshift-cert-manager from 1.10 to 1.7 in order to support the tech-preview channel which was previously used. 
Lowering the version requirement allows for the openshift-cert-manager-operator installed previously to be used during the STF 1.5.2 to 1.5.3 update, removing the update from being blocked. Related: STF-1636 (cherry picked from commit 77dea87dec0656c48d0c7e0b9a09d0fe826425aa) * Only require Interconnect and Smart Gateway (#541) * Only require Interconnect and Smart Gateway Update the dependency management within Service Telemetry Operator to only require AMQ Interconnect and Smart Gateway Operator, which is enough to deploy STF with observabilityStrategy: none. Other Operators can be installed in order to satisfy data storage of telemetry and events. Installation of cert-manager is also required, but needs to be pre-installed similar to Cluster Observability Operator, either as a cluster-scoped operator with the tech-preview channel, or a single time on the cluster as a namespace scoped operator, which is how the stable-v1 channel installs. Documentation will be updated to adjust for this change. Related: STF-1636 * Perform CI update to match docs install changes (#542) * Perform CI update to match docs install changes Update the stf-run-ci scripting to match the documented installation procedures which landed in https://github.com/infrawatch/documentation/pull/513. These changes are also reflected in #541. * Update build/stf-run-ci/tasks/setup_base.yml Co-authored-by: Emma Foley --------- Co-authored-by: Emma Foley * Also drop cert-manager project The cert-manager project gets created with workload items when deploying the cert-manager from the cert-manager-operator project. When removing cert-manager this project is not cleaned up, so we need to delete it as well. 
--------- Co-authored-by: Emma Foley (cherry picked from commit ba9c918ec3e5344a599a911c8ef616ae88c55227) --- build/stf-run-ci/tasks/create_catalog.yml | 2 +- build/stf-run-ci/tasks/pre-clean.yml | 7 +- build/stf-run-ci/tasks/preflight_checks.yml | 4 + build/stf-run-ci/tasks/setup_base.yml | 200 +++++++++--------- build/stf-run-ci/tasks/setup_stf.yml | 15 +- .../metadata/properties.yaml | 23 -- 6 files changed, 114 insertions(+), 137 deletions(-) diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 6a464afd9..6eb6b49df 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -170,4 +170,4 @@ securityContextConfig: legacy updateStrategy: registryPoll: - interval: 1m + interval: 5m diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 8e6df8bef..712d188bf 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -122,7 +122,6 @@ name: smart-gateway-operator-catalog namespace: "{{ namespace }}" -# Remove the cert manager since we install it as part of the CI/documented pre-install process - name: Remove openshift-cert-manager-operator namespace kubernetes.core.k8s: state: absent @@ -131,7 +130,11 @@ apiVersion: project.openshift.io/v1 kind: Project metadata: - name: openshift-cert-manager-operator + name: "{{ item }}" + loop: + - openshift-cert-manager-operator + - cert-manager-operator + - cert-manager - name: Remove Elasticsearch ignore_errors: true diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 5c68b5405..870931789 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -21,6 +21,10 @@ oc describe csv $(oc get csv | grep "service-telemetry-operator" | awk '{print $1}') > {{ logfile_dir }}/oc_get_csv_sto.log 2>&1 cat {{ logfile_dir }} + - name: "Show 
service-telemetry-operator CSV information" + ansible.builtin.debug: + var: csv_sto.stdout + - name: "Show fail message if CSV isn't Succeeded after the alotted time" ansible.builtin.fail: msg: "Service Telemetry Operator CSV not Succeeded after 10 minutes. Check {{ logfile_dir }}/oc_get_csv_sto.log for more information" diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index cf9c92fdf..6f60de71f 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -16,6 +16,7 @@ - disabled: false name: community-operators +# documented procedure: https://infrawatch.github.io/documentation/#deploying-service-telemetry-operator_assembly-installing-the-core-components-of-stf - name: Create OperatorGroup for service-telemetry kubernetes.core.k8s: definition: @@ -28,65 +29,88 @@ targetNamespaces: - "{{ namespace }}" -# deploy cert-manager from tech-preview when using versions of OCP < 4.12 -- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '<') - block: - - name: Create openshift-cert-manager-operator namespace - kubernetes.core.k8s: - definition: - apiVersion: project.openshift.io/v1 - kind: Project - metadata: - name: openshift-cert-manager-operator - spec: - finalizers: - - kubernetes +# documented procedure: https://infrawatch.github.io/documentation/#deploying-observability-operator_assembly-installing-the-core-components-of-stf +- name: Subscribe to Red Hat Obervability Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/observability-operator.openshift-operators: "" + name: observability-operator + namespace: openshift-operators + spec: + channel: stable + installPlanApproval: Automatic + name: observability-operator + source: community-operators + sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy in 
['use_redhat', 'use_hybrid'] - - name: Create openshift-cert-manager-operator OperatorGroup - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1 - kind: OperatorGroup - metadata: - name: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - spec: {} +# undocumented procedure, used for testing updates or old deployment models +- name: Subscribe to Prometheus Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: prometheus + namespace: "{{ namespace }}" + spec: + channel: beta + installPlanApproval: Automatic + name: prometheus + source: community-operators + sourceNamespace: openshift-marketplace + when: + - __service_telemetry_observability_strategy == "use_community" - - name: Subscribe to Cert Manager for OpenShift Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - spec: - channel: "tech-preview" - installPlanApproval: Automatic - name: openshift-cert-manager-operator - source: redhat-operators - sourceNamespace: openshift-marketplace +# documented procedure: https://infrawatch.github.io/documentation/#deploying-certificate-manager-for-openshift-operator_assembly-installing-the-core-components-of-stf +- block: + - name: Create project for cert-manager for Red Hat OpenShift + kubernetes.core.k8s: + definition: + apiVersion: project.openshift.io/v1 + kind: Project + metadata: + name: cert-manager-operator + spec: + finalizers: + - kubernetes -# deploy cert-manager from stable-v1 in 4.12 and later using namespace scoped operator -- when: not __deploy_from_index_enabled | bool and ocp_ver.stdout is version ('4.12', '>=') - block: - - name: Subscribe to Cert Manager for OpenShift Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - 
metadata: - labels: - operators.coreos.com/openshift-cert-manager-operator.service-telemetry: "" - name: openshift-cert-manager-operator-stable-v1-redhat-operators-openshift-marketplace - namespace: "{{ namespace }}" - spec: - channel: stable-v1 - installPlanApproval: Automatic - name: openshift-cert-manager-operator - source: redhat-operators - sourceNamespace: openshift-marketplace + - name: Create OperatorGroup for cert-manager for Red hat OpenShift + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: cert-manager-operator + namespace: cert-manager-operator + spec: + targetNamespaces: + - cert-manager-operator + upgradeStrategy: Default + + - name: Subscribe to cert-manager for Red Hat OpenShift Operator + kubernetes.core.k8s: + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + labels: + operators.coreos.com/openshift-cert-manager-operator.service-telemetry: "" + name: openshift-cert-manager-operator + namespace: cert-manager-operator + spec: + channel: stable-v1 + installPlanApproval: Automatic + name: openshift-cert-manager-operator + source: redhat-operators + sourceNamespace: openshift-marketplace +# installed by properties.yaml definition as of STF 1.5.3 - when: not __deploy_from_index_enabled | bool block: - name: Subscribe to AMQ Interconnect Operator @@ -104,63 +128,29 @@ source: redhat-operators sourceNamespace: openshift-marketplace - - name: Subscribe to Prometheus Operator +# undocumented procedure: used for backwards compatilibity verification +- block: + - name: Subscribe to Elastic Cloud on Kubernetes Operator kubernetes.core.k8s: definition: apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: - name: prometheus + name: elasticsearch-eck-operator-certified namespace: "{{ namespace }}" spec: - channel: beta + channel: stable installPlanApproval: Automatic - name: prometheus - source: community-operators + name: 
elasticsearch-eck-operator-certified + source: certified-operators sourceNamespace: openshift-marketplace - when: - - __service_telemetry_observability_strategy == "use_community" - -- name: Subscribe to Red Hat Obervability Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - labels: - operators.coreos.com/observability-operator.openshift-operators: "" - name: observability-operator - namespace: openshift-operators - spec: - channel: stable - installPlanApproval: Automatic - name: observability-operator - source: community-operators - sourceNamespace: openshift-marketplace - when: - - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] - -- name: Subscribe to Elastic Cloud on Kubernetes Operator - kubernetes.core.k8s: - definition: - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: elasticsearch-eck-operator-certified - namespace: "{{ namespace }}" - spec: - channel: stable - installPlanApproval: Automatic - name: elasticsearch-eck-operator-certified - source: certified-operators - sourceNamespace: openshift-marketplace -- name: Wait for Elasticsearch CRD to appear - kubernetes.core.k8s_info: - api_version: apiextensions.k8s.io/v1 - kind: CustomResourceDefinition - name: elasticsearches.elasticsearch.k8s.elastic.co - register: eckCRD - until: eckCRD.resources[0] is defined - retries: 5 - delay: 30 + - name: Wait for Elasticsearch CRD to appear + kubernetes.core.k8s_info: + api_version: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + name: elasticsearches.elasticsearch.k8s.elastic.co + register: eckCRD + until: eckCRD.resources[0] is defined + retries: 5 + delay: 30 diff --git a/build/stf-run-ci/tasks/setup_stf.yml b/build/stf-run-ci/tasks/setup_stf.yml index ce4713931..c608fd58c 100644 --- a/build/stf-run-ci/tasks/setup_stf.yml +++ b/build/stf-run-ci/tasks/setup_stf.yml @@ -9,7 +9,7 @@ namespace: openshift-marketplace spec: 
displayName: InfraWatch Operators - image: quay.io/infrawatch-operators/infrawatch-catalog:stable-1.5 + image: quay.io/infrawatch-operators/infrawatch-catalog:nightly-1.5 publisher: InfraWatch sourceType: grpc updateStrategy: @@ -49,6 +49,14 @@ sourceNamespace: openshift-marketplace when: service_telemetry_operator_subscription_manifest is not defined +# enable catalogsource +- name: Enable InfraWatch Catalog Source + kubernetes.core.k8s: + definition: + '{{ infrawatch_catalog_source_manifest }}' + +# subscribe to the Operators from the defined CatalogSource sources. +# STO will automatically install SGO via dependencies but pre-subscribe in case deployment from different CatalogSources is specified in an override (for testing purposes). - name: Subscribe to Smart Gateway Operator kubernetes.core.k8s: definition: @@ -58,8 +66,3 @@ kubernetes.core.k8s: definition: '{{ service_telemetry_operator_subscription_manifest }}' - -- name: Enable InfraWatch Catalog Source - kubernetes.core.k8s: - definition: - '{{ infrawatch_catalog_source_manifest }}' diff --git a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml index 2a0d93436..5ffce5254 100644 --- a/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/metadata/properties.yaml @@ -19,26 +19,3 @@ properties: package: packageName: amq7-interconnect-operator versionRange: '>=1.10.0' - - type: olm.constraint - value: - failureMessage: Require certificate management for Service Telemetry Framework - all: - constraints: - - failureMessage: Package openshift-cert-manager-operator is needed for AMQ Interconnect setup - package: - packageName: openshift-cert-manager-operator - versionRange: '>=1.10.0' - - type: olm.constraint - value: - failureMessage: Require Prometheus backend for data storage of metrics for Service Telemetry Framework - any: - constraints: - - 
package: - packageName: prometheus - versionRange: '>=0.56.0' - - package: - packageName: observability-operator - versionRange: '>=0.0.1' - - package: - packageName: cluster-observability-operator - versionRange: '>=0.0.1' From 4ffd1ab7907e280d064b60b9464cde173996dfbe Mon Sep 17 00:00:00 2001 From: migarcia Date: Wed, 7 Feb 2024 19:15:21 +0100 Subject: [PATCH 20/28] Add optional spec.replaces field to CSV for update graph compliance (#572) The way we generate our CSVs uses OLM's skipRange functionality. This is fine, but using only this leads to older versions becoming unavailable after the fact -- see the warning at [1]. By adding an optional spec.replaces to our CSV we allow update testing as well as actual production updates for downstream builds that leverage it. Populating the field requires knowledge of the latest-released bundle, so we take it from an environment variable to be provided by the builder. If this is unset we don't include the spec.replaces field at all -- leaving previous behavior unchanged. Resolves #559 Related: STF-1658 [1] https://olm.operatorframework.io/docs/concepts/olm-architecture/operator-catalog/creating-an-update-graph/#skiprange (cherry picked from commit 99221fbffe2d658161c00fae5905aebe0d989ffa) --- build/generate_bundle.sh | 9 +++++++++ ...service-telemetry-operator.clusterserviceversion.yaml | 1 + 2 files changed, 10 insertions(+) diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index e507ef23e..e169f3988 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -35,6 +35,15 @@ generate_bundle() { ${OPERATOR_SDK} generate bundle --verbose --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" >> ${LOGFILE} 2>&1 popd > /dev/null 2>&1 + # CSVs without a spec.replaces field are valid, so fall back to those if + # latest released version is unknown. 
+ # Placeholder value is validated by operator-sdk during local bundle + # generation and so needs to conform to RFC1123. + if [[ -n "$BUNDLE_LATEST_RELEASED_VERSION" ]]; then + REPLACE_REGEX="$REPLACE_REGEX;s#---bundle-latest-released-version#${BUNDLE_LATEST_RELEASED_VERSION}#g" + else sed -i '/---bundle-latest-released-version/d' "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" + fi + sed -i -E "${REPLACE_REGEX}" "${WORKING_DIR}/manifests/${OPERATOR_NAME}.clusterserviceversion.yaml" } diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 1e1fdc092..65e5a244e 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -458,4 +458,5 @@ spec: minKubeVersion: 1.23.0 provider: name: Red Hat + replaces: service-telemetry-operator.v---bundle-latest-released-version version: 1.99.0 From ad468f2b1d33186c998beecc65366270e7a4f2f8 Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Wed, 14 Feb 2024 19:24:21 +0100 Subject: [PATCH 21/28] STF 1.5.4 release ops (#574) * Add gitleaks.toml for rh-gitleaks (#510) Add a .gitleaks.toml file to avoid the false positive leak for the example certificate when deploying for Elasticsearch. * [stf-collect-logs] Move describe build|pod from ci/ to the role (#505) * [stf-run-ci] Fix check to include bool filter (#511) Update the check to use bool filter instead of a bar var. By default, ansible parses vars as strings, and without the | bool filter, this check is invalid, as it will always resolve to true, since it is a non-empty string. Other instances of the same check did this, but this one was missed. 
* [allow_skip_clone] Allow skipping of the cloning stages (#512) * [allow_skip_clone] Use _dir instead of hardcoding all directories relative to base_dir This will allow configuration of the repo clone destination, so we can use pre-cloned dirs instead of explicitly cloning the dirs each time. This is essential for CI systems like zuul, that set-up the repos with particular versions/branches prior to running the test scripts. * [zuul] List the other infrawatch repos as required for the job * [zuul] Set the {sgo,sg-bridge,sg-core,prometheus-webhook-snmp}_dir vars Add in the repo dir locations where the repos should be pre-cloned by zuul * Replace base_dir with sto_dir * set sto_dir relative to base_dir if it isn't already set * [ci] use absolute dir for requirements.txt * [ci] Update sto_dir using explicit reference zuul.project.src_dir refers to the current project dir. When using the jobs in another infrawatch project, this becomes invalid. Instead, sto_dir is explicitly set using zuul.projects[].src_dir, the same way that the other repo dirs are set in vars-zuul-common --------- Co-authored-by: Chris Sibbitt * Fix qdr auth one_time_upgrade label check (#518) * Fix qdr auth one_time_upgrade label check * Fix incorrect variable naming on one_time_upgrade label check * Adjust QDR authentication password generation (#520) Adjust the passwords being generated for QDR authentication since certain characters (such as colon) will cause a failure in the parsing routine within qpid-dispatch. Updates the lookup function to only use ascii_letters and digits and increases the length to 32 characters.
--------- Co-authored-by: Leif Madsen * Add docs for skip_clone (#515) * [allow_skip_clone] Add docs for clone_repos and *_dir vars * Align README table column spacing (#516) * Align README table column spacing * Update build/stf-run-ci/README.md --------- Co-authored-by: Emma Foley --------- Co-authored-by: Leif Madsen * [zuul] Add STO to required repos (#524) It appears that STO is not included explicitly when running jobs from SGO [1]. This will be the case in all the other repos. This change explicitly adds it, in case it's not already included by zuul. [1] https://review.rdoproject.org/zuul/build/edd8f17bfdac4360a94186b46c4cea3f * QDR Auth in smoketest (#525) * QDR Auth in smoketest * Added qdr-test as a mock of the OSP-side QDR * Connection from qdr-test -> default-interconnect is TLS+Auth * Collectors point at qdr-test instead of default-interconnect directly * Much more realistic than the existing setup * Eliminated a substitution in sensubility config * Used default QDR basic auth in Jenkinsfile * QDR Auth for infrared 17.1 script (#517) * QDR Auth for infrared 17.1 script * Fix missing substitution for AMQP_PASS in infrared script * [zuul] Define a project template for stf-crc-jobs (#514) * [allow_skip_clone] Use _dir instead of hardcoding all directories relative to base_dir This will allow configuration of the repo clone destination, so we can use pre-cloned dirs instead of explicitly cloning the dirs each time. This is essential for CI systems like zuul, that set-up the repos with particular versions/branches prior to running the test scripts.
* [zuul] List the other infrawatch repos as required for the job * [zuul] Set the {sgo,sg-bridge,sg-core,prometheus-webhook-snmp}_dir vars Add in the repo dir locations where the repos should be pre-cloned by zuul * Replace base_dir with sto_dir * set sto_dir relative to base_dir if it isn't already set * [ci] use absolute dir for requirements.txt * [ci] Update sto_dir using explicit reference zuul.project.src_dir refers to the current project dir. When using the jobs in another infrawatch project, this becomes invalid. Instead, sto_dir is explicitly set using zuul.projects[].src_dir, the same way that the other repo dirs are set in vars-zuul-common * [zuul] Define a project template for stf-crc-jobs Instead of listing all the jobs for each project in-repo, and needing to update the list every time that a new job is added, the project template can be updated and the changes propagated to the other infrawatch projects * [zuul] don't enable using the template * Revert "[zuul] don't enable using the template" This reverts commit 56e2009773d13587db890a6d6ca22d30f485c9cb.
--------- Co-authored-by: Chris Sibbitt * Restart QDR after changing the password (#530) * Restart QDR after changing the password * Fixes bug reported here: https://github.com/infrawatch/service-telemetry-operator/pull/517#issuecomment-1794919985 * Avoids an extra manual step when changing password * Would affect users who upgrade from earlier STF and subsequently enable basic auth * Also users who need to change their passwords * Fixing ansible lint * Update roles/servicetelemetry/tasks/component_qdr.yml * Adjust QDR restarts to account for HA * [smoketest] Wait for qdr-test to be Running * [smoketest] Wait for QDR password upgrade * Remove zuul QDR auth override * [zuul] Add jobs to test with different versions of OCP (#432) * Add crc_ocp_bundle value to select OCP version * zuul: add log collection post-task to get crc logs * Add ocp v13 and a timeout to the job * Update README for 17.1 IR test (#533) * Update README for 17.1 IR test Update the 17.1 infrared test script README to show how to deploy a virtualized workload on the deployed overcloud infrastructure. Helps with testing by providing additional telemetry to STF required in certain dashboards. * Update tests/infrared/17.1/README.md Co-authored-by: Chris Sibbitt * Update tests/infrared/17.1/README.md --------- Co-authored-by: Chris Sibbitt * Support OCP v4.12 through v4.14 (#535) Support STF 1.5.3 starting at OpenShift version 4.12 due to incompatibility with 4.11 due to dependency requirements. Our primary target is support of OCP EUS releases. Closes: STF-1632 * [stf-collect-logs] Add ignore_errors to task (#529) The "Question the deployment" task didn't have ignore_errors: true set, so when the task fails, the play is finished. This means that we don't get to the "copy logs" task and can't see the job logs in zuul. 
ignore_errors is set to true to be consistent with other tasks * Mgirgisf/stf 1580/fix log commands (#526) * update stf-collect-logs tasks * Update log path * solve log bugs in stf-run-ci tasks * create log directory * Adjust Operator dependency version requirements (#538) Adjust the operator package dependency requirements to align to known required versions. Primarily reduce the version of openshift-cert-manager from 1.10 to 1.7 in order to support the tech-preview channel which was previously used. Lowering the version requirement allows for the openshift-cert-manager-operator installed previously to be used during the STF 1.5.2 to 1.5.3 update, removing the update from being blocked. Related: STF-1636 * Clean up stf-run-ci for OCP 4.12 minimum version (#539) Update the stf-run-ci base setup to no longer need testing against OCP 4.10 and earlier, meaning we can rely on a single workflow for installation. Also update the deployment to use cluster-observability-operator via the redhat-operators CatalogSource for installation via use_redhat and use_hybrid strategies. * [zuul] Add job to build locally and do an index-based deployment (#495) * [zuul] Add job to build locally and do an index-based deployment * Only require Interconnect and Smart Gateway (#541) * Only require Interconnect and Smart Gateway Update the dependency management within Service Telemetry Operator to only require AMQ Interconnect and Smart Gateway Operator, which is enough to deploy STF with observabilityStrategy: none. Other Operators can be installed in order to satisfy data storage of telemetry and events. Installation of cert-manager is also required, but needs to be pre-installed similar to Cluster Observability Operator, either as a cluster-scoped operator with the tech-preview channel, or a single time on the cluster as a namespace scoped operator, which is how the stable-v1 channel installs. Documentation will be updated to adjust for this change. 
Related: STF-1636 * Perform CI update to match docs install changes (#542) * Perform CI update to match docs install changes Update the stf-run-ci scripting to match the documented installation procedures which landed in https://github.com/infrawatch/documentation/pull/513. These changes are also reflected in #541. * Update build/stf-run-ci/tasks/setup_base.yml Co-authored-by: Emma Foley --------- Co-authored-by: Emma Foley * Also drop cert-manager project The cert-manager project gets created with workload items when deploying the cert-manager from the cert-manager-operator project. When removing cert-manager this project is not cleaned up, so we need to delete it as well. --------- Co-authored-by: Emma Foley * [stf-run-ci] Explicitly check the validate_deployment was successful (#545) In [1], the validate_deployment step is successful, despite the deployment not being successful. This causes the job to time out because the following steps continue to run despite an invalid state. To get the expected behaviour, the output should be checked for a string indicating success. i.e. * [info] CI Build complete. You can now run tests. [2] shows the output for a successful run. [1] https://review.rdoproject.org/zuul/build/245ae63e41884dc09353d938ec9058d7/console#5/0/144/controller [2] https://review.rdoproject.org/zuul/build/802432b23da24649b818985b7b1633bb/console#5/0/82/controller * Implement dashboard management (#548) * Implement dashboard management Implement a new configuration option graphing.grafana.dashboards.enabled which results in dashboards objects being created for the Grafana Operator. Previously loading dashboards would be done manually via 'oc apply' using instructions from documentation. The new CRD parameters to the ServiceTelemetry object allow the Service Telemetry Operator to now make the GrafanaDashboard objects directly.
Related: OSPRH-825 * Drop unnecessary cluster roles * Update CSV for owned parameter * Remove basic-auth method from grafana (#550) * Only openshift auth will be allowed * Adjust Alertmanager SAR to be more specific * This matches recent changes in prometheus[1] and grafana[2] [1] https://github.com/infrawatch/service-telemetry-operator/pull/549/files#diff-2cf84bcf66f12393c86949ec0d3f16c473a650173d55549bb02556d23aa22bd2R46 [2] https://github.com/infrawatch/service-telemetry-operator/pull/550/files#diff-ae71801975adb4f8dd4aa5479a66ad46e46f17de40f9d147b2e09e13ce26633eR45 * Revert "Adjust Alertmanager SAR to be more specific" This reverts commit 0f94fd577617aee6a85fc4141f98ebdfc49a9f92. * Auth to prometheus using token instead of basicauth (#549) * Auth to prometheus using token instead of basicauth * Add present/absent logic to prometheus-reader resources * s/password/token in smoketest output * [zuul] Make nightly_bundles jobs non-voting (#551) --------- Co-authored-by: Emma Foley * Fix branch co-ordination in stf-run-ci (#555) I think it got broken by an oops recently[1]. Since that change, working_branch (`branch` at that point) is never used because version_branches.sgo has a default value. This breaks the branch co-ordination in Jenkins[2] and in local testing[3]. 
[1] https://github.com/infrawatch/service-telemetry-operator/pull/512/files#diff-c073fe1e346d08112920aa0bbc8a7453bbd3032b7a9b09ae8cbc70df4db4ea2dR19 [2] https://github.com/infrawatch/service-telemetry-operator/blob/0f94fd577617aee6a85fc4141f98ebdfc49a9f92/Jenkinsfile#L157 [3] https://github.com/infrawatch/service-telemetry-operator/blob/0f94fd577617aee6a85fc4141f98ebdfc49a9f92/README.md?plain=1#L62 * Adjust Alertmanager SAR to be more specific (#553) * This matches recent changes in prometheus[1] and grafana[2] [1] https://github.com/infrawatch/service-telemetry-operator/pull/549/files#diff-2cf84bcf66f12393c86949ec0d3f16c473a650173d55549bb02556d23aa22bd2R46 [2] https://github.com/infrawatch/service-telemetry-operator/pull/550/files#diff-ae71801975adb4f8dd4aa5479a66ad46e46f17de40f9d147b2e09e13ce26633eR45 * Add optional spec.replaces field to CSV for update graph compliance The way we generate our CSVs uses OLM's skipRange functionality. This is fine, but using only this leads to older versions becoming unavailable after the fact -- see the warning at [1]. By adding an optional spec.replaces to our CSV we allow update testing as well as actual production updates for downstream builds that leverage it. Populating the field requires knowledge of the latest-released bundle, so we take it from an environment variable to be provided by the builder. If this is unset we don't include the spec.replaces field at all -- leaving previous behavior unchanged. 
Resolves #559 Related: STF-1658 [1] https://olm.operatorframework.io/docs/concepts/olm-architecture/operator-catalog/creating-an-update-graph/#skiprange * Stop using ephemeral storage for testing (#547) Update the __service_telemetry_storage_persistent_storage_class to use CRC PVs Use the default value (false) for __service_telemetry_storage_ephemeral_enabled * [zuul] Use extracted CRC nodes in stf-base (#531) * [zuul] Update base job for stf-base * Add in required projects: dataplane-operator, infra-operator, openstack-operator * Remove nodeset from stf-base; it overrides the nodeset set in the base job. The nodeset is going to be used to select the OCP version * [zuul] define nodesets for easy reuse * Define the nodeset * Rename the base * Select OCP version with the nodeset * [zuul] Add a login command to get initial kubeconfig file * [stf-run-ci] Add retries to pre-clean * Update galaxy requirements * [ci] Add retry to login command * [ci] Configure kubeconfig for rhol_crc role * Apply suggestions from code review * Zuul: Update how we get the initial kubeconfig (#558) * use ci-framework infra playbook * add make targets to do set-up * link the kubeconfig files * Remove pre-get_kubeconfig.yml; the script is no longer used * [ci] Add common-tasks.yml to cover the tasks that set up every play (#556) * [zuul] Update the labels used for extracted CRC * Remove non-default cifmw_rhol_crc_kubeconfig value * Implement support for Grafana Operator v5 (#561) * Implement support for Grafana Operator v5 Implement changes to support Grafana Operator v5 when the new grafana.integreatly.org CRD is available. Use the new CRDs as default when they are available. Fall back to deploying with Grafana Operator v4 when the Grafana Operator v5 CRDs are not available, thereby providing backwards compatibility to allow administrators time to migrate.
Additionally, the polystat plugin has been removed from the rhos-cloud dashboard due to compatibility issues with grafana-cli usage when dynamically loading plugins. Usage of Grafana Operator v5 is also a target for disconnected support, and dynamically loading plugins in these environments is expected to be a problem. Related: OSPRH-2577 Closes: STF-1667 * Default Grafana role set to Admin In order to match the previous (Grafana Operator v4) role, set auto_assign_org_role to the Admin value. Default is Viewer. * Remove old vendored operator_sdk/util collection (#563) Remove the old 0.1.0 vendored collection operator_sdk/util from the upstream Dockerfile and repository. Instead use the default operator_sdk/util in the base image which is a newer version of 0.4.0. We only use the util collection for one call to k8s_status when ephemeral storage is enabled. The newer collection also provides a k8s_event module which could be useful in the future. Closes: STF-1683 * Add nightly_bundle jobs to periodic pipeline (#564) The nightly_bundle jobs will run once a day * Remove hard-coded Prometheus version in template (#565) Remove the hard-coded Prometheus version in the Prometheus template when using observabilityStrategy use_redhat, which uses Cluster Observability Operator to manage the Prometheus instance requests. Previously this value was hard-coded to prevent a potential rollback when moving from Community Prometheus Operator to Cluster Observability Operator. Resolves: JIRA#OSPRH-2140 * Set features.operators.openshift.io/disconnected to True (#570) STF can now be deployed in disconnected mode. This change updates the features.operators.openshift.io/disconnected annotation to reflect this. * [stf-run-ci] Update validation check for bundle URLs (#571) * [stf-run-ci] Update validation check for bundle URLs An empty string passed as the bundle URL will pass the existing test of "is defined" and "is not None" and still be invalid. 
The validation for the bundle URL can be done in one check per var: * If the var is undefined, it becomes "", and the check fails, because of length * If the var is None, there's an error because None does not have a length * If the var is an empty string, the check fails because of the length This simplifies the check and improves readability * Prefer Grafana 9 workload (#575) Prefer usage of Grafana 9 container image from RHCC. Grafana 7 is EOL upstream and receives no security support. Prefer use of Grafana 9 which is still supported. --------- Co-authored-by: Leif Madsen Co-authored-by: Emma Foley Co-authored-by: Chris Sibbitt Co-authored-by: Marihan Girgis <102027102+mgirgisf@users.noreply.github.com> Co-authored-by: Miguel Garcia --- .zuul.yaml | 216 ++ build/Dockerfile | 1 - build/run-ci.yaml | 9 +- build/stf-collect-logs/tasks/main.yml | 77 +- build/stf-collect-logs/vars/main.yml | 6 + build/stf-run-ci/README.md | 88 +- build/stf-run-ci/defaults/main.yml | 1 + build/stf-run-ci/tasks/clone_repos.yml | 42 +- build/stf-run-ci/tasks/create_builds.yml | 2 - build/stf-run-ci/tasks/create_catalog.yml | 73 +- build/stf-run-ci/tasks/main.yml | 26 +- build/stf-run-ci/tasks/pre-clean.yml | 2 + build/stf-run-ci/tasks/preflight_checks.yml | 7 +- build/stf-run-ci/tasks/setup_base.yml | 8 +- .../tasks/setup_stf_from_bundles.yml | 8 +- .../tasks/setup_stf_local_build.yml | 19 +- ci/common-tasks.yml | 13 + ci/deploy_stf.yml | 14 +- ci/post-collect_logs.yml | 35 +- ci/pre-2node.yml | 33 + ci/prepare.yml | 18 +- ci/test_stf.yml | 14 +- ci/vars-local_build-index_deploy.yml | 5 + ci/vars-local_build.yml | 1 - ci/vars-nightly_bundles.yml | 1 - ci/vars-zuul-common.yml | 6 + .../operator_sdk/util/.gitignore | 1 - .../operator_sdk/util/FILES.json | 89 - .../operator_sdk/util/LICENSE | 201 -- .../operator_sdk/util/MANIFEST.json | 36 - .../operator_sdk/util/README.md | 40 - .../operator_sdk/util/demo/README.md | 26 - .../operator_sdk/util/demo/playbook.yml | 15 - 
.../operator_sdk/util/plugins/README.md | 31 - .../util/plugins/modules/k8s_status.py | 404 --- .../util/plugins/modules/requeue_after.py | 93 - .../infra.watch_servicetelemetrys_crd.yaml | 16 +- ...fra.watch_v1beta1_servicetelemetry_cr.yaml | 7 +- .../infra.watch_servicetelemetrys_crd.yaml | 17 +- ...emetry-operator.clusterserviceversion.yaml | 11 +- deploy/role.yaml | 1 + roles/servicetelemetry/defaults/main.yml | 8 +- .../files/memcached-dashboard.json | 1513 ++++++++++++ .../files/rhos-cloud-dashboard.json | 1633 ++++++++++++ .../files/rhos-dashboard.json | 2179 +++++++++++++++++ .../files/virtual-machine-view.json | 1112 +++++++++ .../tasks/component_grafana.yml | 351 ++- .../tasks/component_prometheus.yml | 58 +- .../tasks/component_prometheus_reader.yml | 58 + roles/servicetelemetry/tasks/main.yml | 10 +- .../templates/manifest_alertmanager.j2 | 4 +- .../templates/manifest_grafana.j2 | 9 +- .../templates/manifest_grafana_ds.j2 | 5 +- .../manifest_grafana_ds_prometheus.j2 | 24 + .../templates/manifest_grafana_v5.j2 | 97 + .../templates/manifest_prometheus.j2 | 13 +- tests/infrared/17.1/README.md | 74 +- tests/smoketest/smoketest.sh | 6 +- .../smoketest_ceilometer_entrypoint.sh | 6 +- .../smoketest_collectd_entrypoint.sh | 8 +- tests/smoketest/smoketest_job.yaml.template | 8 +- 61 files changed, 7525 insertions(+), 1364 deletions(-) create mode 100644 .zuul.yaml create mode 100644 ci/common-tasks.yml create mode 100644 ci/pre-2node.yml create mode 100644 ci/vars-local_build-index_deploy.yml delete mode 100644 collections/ansible_collections/operator_sdk/util/.gitignore delete mode 100644 collections/ansible_collections/operator_sdk/util/FILES.json delete mode 100644 collections/ansible_collections/operator_sdk/util/LICENSE delete mode 100644 collections/ansible_collections/operator_sdk/util/MANIFEST.json delete mode 100644 collections/ansible_collections/operator_sdk/util/README.md delete mode 100644 
collections/ansible_collections/operator_sdk/util/demo/README.md delete mode 100644 collections/ansible_collections/operator_sdk/util/demo/playbook.yml delete mode 100644 collections/ansible_collections/operator_sdk/util/plugins/README.md delete mode 100644 collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py delete mode 100644 collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py create mode 100644 roles/servicetelemetry/files/memcached-dashboard.json create mode 100644 roles/servicetelemetry/files/rhos-cloud-dashboard.json create mode 100644 roles/servicetelemetry/files/rhos-dashboard.json create mode 100644 roles/servicetelemetry/files/virtual-machine-view.json create mode 100644 roles/servicetelemetry/tasks/component_prometheus_reader.yml create mode 100644 roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 create mode 100644 roles/servicetelemetry/templates/manifest_grafana_v5.j2 diff --git a/.zuul.yaml b/.zuul.yaml new file mode 100644 index 000000000..a418c6506 --- /dev/null +++ b/.zuul.yaml @@ -0,0 +1,216 @@ +--- +- nodeset: + name: stf-crc_extracted-ocp412 + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-19-0-xxl + +- nodeset: + name: stf-crc_extracted-ocp413 + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-28-0-xxl + +- nodeset: + name: stf-crc_extracted-ocp414 + nodes: + - name: controller + label: cloud-centos-9-stream-tripleo-vexxhost + - name: crc + label: coreos-crc-extracted-2-30-0-xxl + +# Based on the 2-node job cookbook at https://github.com/openstack-k8s-operators/ci-framework/blob/main/docs/source/cookbooks/zuul-job-nodeset.md +- job: + name: stf-base-2node + parent: podified-multinode-edpm-deployment-crc + abstract: true + required-projects: + - name: github.com/openstack-k8s-operators/dataplane-operator + override-checkout: 
main + - name: github.com/openstack-k8s-operators/infra-operator + override-checkout: main + - name: github.com/openstack-k8s-operators/openstack-operator + override-checkout: main + - name: github.com/openstack-k8s-operators/openstack-must-gather + override-checkout: main + pre-run: + - ci/pre-2node.yml + vars: + cifmw_deploy_edpm: false + podified_validation: true + cifmw_run_tests: false + extra-vars: + crc_ci_bootstrap_networking: + networks: + default: + range: 192.168.122.0/24 + mtu: 1500 + internal-api: + vlan: 20 + range: 172.17.0.0/24 + storage: + vlan: 21 + range: 172.18.0.0/24 + tenant: + vlan: 22 + range: 172.19.0.0/24 + instances: + controller: + networks: + default: + ip: 192.168.122.11 + crc: + networks: + default: + ip: 192.168.122.10 + internal-api: + ip: 172.17.0.5 + storage: + ip: 172.18.0.5 + tenant: + ip: 172.19.0.5 + +- job: + name: stf-base + # defined in: https://review.rdoproject.org/cgit/config/tree/zuul.d/_jobs-crc.yaml + parent: stf-base-2node + abstract: true + description: | + Run the stf-run-ci role, and then test stf + roles: # adds in dependent roles i.e. 
put it in the role path + - zuul: github.com/openstack-k8s-operators/ci-framework + # These are the additional repos that zuul will clone + required-projects: + - name: openstack-k8s-operators/ci-framework + override-checkout: main + - name: github.com/infrawatch/service-telemetry-operator + - name: github.com/infrawatch/smart-gateway-operator + - name: github.com/infrawatch/sg-bridge + - name: github.com/infrawatch/sg-core + - name: github.com/infrawatch/prometheus-webhook-snmp + pre-run: + - ci/prepare.yml + run: + - ci/deploy_stf.yml + - ci/test_stf.yml + post-run: + - ci/post-collect_logs.yml + # The default (~30 minutes) is not enough to run through all the job stages + timeout: 3600 + +- job: + name: stf-crc-nightly_bundles + parent: stf-base + abstract: true + description: | + Deploy STF using the nightly bundles + vars: + scenario: "nightly_bundles" + +- job: + name: stf-crc-local_build + parent: stf-base + abstract: true + description: | + Build images locally and deploy STF + vars: + scenario: "local_build" + +- job: + name: stf-crc-local_build-index_deploy + parent: stf-base + abstract: true + description: | + Build STF locally and deploy from index + vars: + scenario: "local_build-index_deploy" + +- job: + name: stf-crc-ocp_412-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Deploy STF using the nightly bundles on OCP 4.12 + nodeset: stf-crc_extracted-ocp412 + +- job: + name: stf-crc-ocp_413-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Deploy STF using the nightly bundles on OCP 4.13 + nodeset: stf-crc_extracted-ocp413 + +- job: + name: stf-crc-ocp_414-nightly_bundles + parent: stf-crc-nightly_bundles + description: | + Deploy STF using the nightly bundles on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 + +- job: + name: stf-crc-ocp_412-local_build + parent: stf-crc-local_build + description: | + Build images locally and deploy STF on OCP 4.12 + nodeset: stf-crc_extracted-ocp412 + +- job: + name: 
stf-crc-ocp_413-local_build + parent: stf-crc-local_build + description: | + Build images locally and deploy STF on OCP 4.13 + nodeset: stf-crc_extracted-ocp413 + +- job: + name: stf-crc-ocp_414-local_build + parent: stf-crc-local_build + description: | + Build images locally and deploy STF on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 + +- job: + name: stf-crc-ocp_412-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.12 + nodeset: stf-crc_extracted-ocp412 + +- job: + name: stf-crc-ocp_413-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.13 + nodeset: stf-crc_extracted-ocp413 + +- job: + name: stf-crc-ocp_414-local_build-index_deploy + parent: stf-crc-local_build-index_deploy + description: | + Build STF locally and deploy from index on OCP 4.14 + nodeset: stf-crc_extracted-ocp414 + +- project-template: + name: stf-crc-jobs + description: | + STF CRC jobs that build and deploy STF + github-check: + jobs: + - stf-crc-ocp_412-local_build + - stf-crc-ocp_413-local_build + - stf-crc-ocp_414-local_build + - stf-crc-ocp_412-local_build-index_deploy + - stf-crc-ocp_413-local_build-index_deploy + - stf-crc-ocp_414-local_build-index_deploy + +- project: + name: infrawatch/service-telemetry-operator + templates: + - stf-crc-jobs + periodic: + jobs: + - stf-crc-ocp_412-nightly_bundles + - stf-crc-ocp_413-nightly_bundles + - stf-crc-ocp_414-nightly_bundles diff --git a/build/Dockerfile b/build/Dockerfile index da2b7508f..a236fb9c8 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -18,4 +18,3 @@ USER 1001 # copy in required artifacts for the operator COPY watches.yaml ${HOME}/watches.yaml COPY roles/ ${HOME}/roles/ -COPY collections/ ${HOME}/.ansible/collections/ diff --git a/build/run-ci.yaml b/build/run-ci.yaml index bfd07c3cb..a81fcd8aa 100644 --- a/build/run-ci.yaml +++ b/build/run-ci.yaml @@ 
-7,9 +7,14 @@ - name: Run the STF CI system import_role: name: stf-run-ci - + + - name: Create Log directory + file: + path: "{{ playbook_dir }}/working/logs" + state: directory + - name: Collect the logs import_role: name: stf-collect-logs vars: - logfile_dir: "{{ playbook_dir }}/" + logfile_dir: "{{ playbook_dir }}/working/logs/" diff --git a/build/stf-collect-logs/tasks/main.yml b/build/stf-collect-logs/tasks/main.yml index dde52761f..347d07f37 100644 --- a/build/stf-collect-logs/tasks/main.yml +++ b/build/stf-collect-logs/tasks/main.yml @@ -1,33 +1,22 @@ --- -- name: "Get builds" - ansible.builtin.shell: - cmd: | - echo "*** [INFO] Showing oc get builds" > {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - echo "*** [INFO] Showing oc get builds -oyaml" >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - oc -n {{ namespace }} get builds -oyaml >> {{ logfile_dir }}/post_oc_get_builds.log 2>&1 - cat {{ logfile_dir }}/post_oc_get_builds.log +- name: "Get resources logs [ Builds, Subscriptions, Image, Imagestreams, Pods ]" + ansible.builtin.shell: | + for resource in {{ resource_types|join(' ') }}; do + log_file="{{ logfile_dir }}/post_oc_get_$resource.log" + echo "*** [INFO] Showing oc get '$resource'" > "$log_file" 2>&1 + oc -n {{ namespace }} get "$resource" >> "$log_file" 2>&1 + echo "[INFO] oc get '$resource' -oyaml" >> "$log_file" 2>&1 + oc -n {{ namespace }} get "$resource" -oyaml >> "$log_file" 2>&1 + done + delay: 10 + retries: 3 ignore_errors: true changed_when: false -- name: "Get subscription details" - ansible.builtin.shell: - cmd: | - oc -n {{ namespace }} get subscriptions > {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 - oc -n {{ namespace }} describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_get_subscriptions.log 2>&1 - ignore_errors: true - -- name: "Get image infos" +- name: "Get Additional Information details" 
ansible.builtin.shell: cmd: | - echo "[INFO] oc get images" > {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc -n {{ namespace }} get images >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - echo "[INFO] oc get imagestreams" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc -n {{ namespace }} get imagestream >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - echo "[INFO] oc get imagestream -oyaml" >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - oc -n {{ namespace }} get imagestream -oyaml >> {{ logfile_dir }}/post_oc_get_images.log 2>&1 - retries: 3 - delay: 10 + oc -n {{ namespace }} describe subscription service-telemetry-operator >> {{ logfile_dir }}/post_oc_describe_subscriptions_STO.log 2>&1 ignore_errors: true - name: "Get STO info" @@ -47,15 +36,45 @@ oc -n {{ namespace }} get csv | grep service-telemetry-operator >> {{ logfile_dir }}/post_question_deployment.log 2>&1 oc -n {{ namespace }} get csv $(oc -n {{ namespace }} get csv | grep "service-telemetry-operator" | awk '{ print $1}') -oyaml >> {{ logfile_dir }}/post_question_deployment.log 2>&1 register: output + ignore_errors: true retries: 3 delay: 10 -- name: "Get pods" - ansible.builtin.command: +- name: "Describe non-completed, non-running pods" + ansible.builtin.shell: cmd: | - oc -n {{ namespace }} get pods > {{ logfile_dir }}/post_oc_get_pods.log 2>&1 - echo "Additional information" >> {{ logfile_dir }}/post_oc_get_pods.log - oc -n {{ namespace }} describe pods >> {{ logfile_dir }}/post_oc_get_pods.log 2>&1 + for pod in $(oc get pods | grep -v NAME | grep -v Running | awk '{ print $1 }'); + do + oc -n {{ namespace }} describe pod $pod > {{ logfile_dir }}/post_oc_describe_pod_${pod}.log 2>&1 + done ignore_errors: true retries: 3 delay: 10 + +- name: "Describe builds" + ansible.builtin.shell: + cmd: | + for build in $(oc -n {{ namespace }} get builds -o json | jq -r '.items[].metadata.name'); + do + oc -n {{ namespace }} describe build $build > {{ logfile_dir 
}}/post_oc_describe_build_${build}.log 2>&1 + done + ignore_errors: true + retries: 3 + delay: 10 + +- name: "Get PV and PVC information details" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} get pv >> {{ logfile_dir }}/post_pv.log 2>&1 + oc -n {{ namespace }} get pvc >> {{ logfile_dir }}/post_pvc.log 2>&1 + ignore_errors: true + +- name: "Get SGO,STO and QDR logs" + ansible.builtin.shell: + cmd: | + oc -n {{ namespace }} logs $(oc -n {{ namespace }} get pod -l name=service-telemetry-operator -o jsonpath='{.items[].metadata.name}') >> {{ logfile_dir }}/logs_sto.log 2>&1 + oc -n {{ namespace }} logs $(oc -n {{ namespace }} get pod -l app=smart-gateway-operator -o jsonpath='{.items[].metadata.name}') >> {{ logfile_dir }}/logs_sgo.log 2>&1 + oc -n {{ namespace }} logs $(oc -n {{ namespace }} get pod -l qdr -o jsonpath='{.items[].metadata.name}') >> {{ logfile_dir }}/logs_qdr.log 2>&1 + ignore_errors: true + retries: 3 + delay: 10 \ No newline at end of file diff --git a/build/stf-collect-logs/vars/main.yml b/build/stf-collect-logs/vars/main.yml index 5197b0284..dbf668d77 100644 --- a/build/stf-collect-logs/vars/main.yml +++ b/build/stf-collect-logs/vars/main.yml @@ -1,2 +1,8 @@ --- # vars file for stf-collect-logs +resource_types: + - builds + - subscriptions + - images + - imagestream + - pods \ No newline at end of file diff --git a/build/stf-run-ci/README.md b/build/stf-run-ci/README.md index 353a8f81f..664e57e7a 100644 --- a/build/stf-run-ci/README.md +++ b/build/stf-run-ci/README.md @@ -15,47 +15,53 @@ Primarily this means a running CodeReady Container system has been provided. 
Not all variables are listed here, but these are the most common ones you might choose to override: -| Parameter name | Values | Default | Description | -| ------------------------------ | ------------ | --------- | ------------------------------------ | -| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | -| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. Also see `working_branch`, `sg_branch`, `sgo_branch` | -| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | -| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. | -| `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | -| `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | -| `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to setup or not a TLS CA cert for the bundle registry access | -| `setup_bundle_registry_auth` | {true,false} | true | Whether to setup or not the auth for the bundle registry access | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | -| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | -| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | -| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | -| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | -| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | -| `sg_core_repository` | | 
https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | -| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | -| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | -| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | -| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | -| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | -| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | -| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | -| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | -| `__service_telemetry_snmptraps_enabled` | {true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | -| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | -| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | -| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. 
Defaults to 5 | -| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | -| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | -| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | -| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | -| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | -| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | -| `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. Also supported are 'use_hybrid', 'use_community', and 'none' | -| `__service_telemetry_transports_qdr_auth` | {'none', 'basic'} | `none` | Which auth method to use for QDR. Can be 'none' or 'basic'. Note: 'basic' is not yet supported in smoketests. | -| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | -| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | -| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | +| Parameter name | Values | Default | Description | +| ------------------------------ | ------------ | --------- | ------------------------------------ | +| `__deploy_stf` | {true,false} | true | Whether to deploy an instance of STF | +| `__local_build_enabled` | {true,false} | true | Whether to deploy STF from local built artifacts. 
Also see `working_branch`, `sg_branch`, `sgo_branch` | +| `__deploy_from_bundles_enabled` | {true,false} | false | Whether to deploy STF from OLM bundles (TODO: compat with `__local_build_enabled`) | +| `__deploy_from_index_enabled` | {true,false} | false | Whether to deploy STF from locally built bundles and index image. | +| `__service_telemetry_bundle_image_path` | | `quay.io/infrawatch-operators/service-telemetry-operator-bundle:nightly-head` | Image path to Service Telemetry Operator bundle | +| `__smart_gateway_bundle_image_path` | | `quay.io/infrawatch-operators/smart-gateway-operator-bundle:nightly-head` | Image path to Smart Gateway Operator bundle | +| `setup_bundle_registry_tls_ca` | {true,false} | true | Whether to setup or not a TLS CA cert for the bundle registry access | +| `setup_bundle_registry_auth` | {true,false} | true | Whether to setup or not the auth for the bundle registry access | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus Webhook SNMP git branch to checkout | +| `sgo_branch` | | master | Which Smart Gateway Operator git branch to checkout | +| `sg_core_branch` | | master | Which Smart Gateway Core git branch to checkout | +| `sg_bridge_branch` | | master | Which Smart Gateway Bridge git branch to checkout | +| `prometheus_webhook_snmp_branch` | | master | Which Prometheus webhook snmp branch to checkout | +| `sgo_repository` | | https://github.com/infrawatch/smart-gateway-operator | Which Smart Gateway Operator git repository to clone | +| `sg_core_repository` | | https://github.com/infrawatch/sg-core | Which Smart Gateway Core git repository to clone | +| `sg_bridge_repository` | | https://github.com/infrawatch/sg-bridge | Which Smart Gateway Bridge git repository to clone | +| `prometheus_webhook_snmp_repository` | | https://github.com/infrawatch/prometheus-webhook-snmp | Which Prometheus webhook snmp git repository to clone | +| `clone_repos` | {true, false} | true | Whether to clone the repos. 
If false, the repos will not be cloned, and the user will need to specify a value for `sto_dir`. The location of the other repos may need to be specified as well. (see relevant sections). | +| `sto_dir` | | `{{ playbook_dir }}/..` | The location of the service-telemetry-operator directory (needed to set the other repo paths) | +| `sgo_dir` | | `{{ sto_dir }}/build/working/smart-gateway-operator` | The directory to clone smart-gateway-operator into (when clone_repos == true) or the location of the the repo (when clone_repos == false) | +| `sg_core_dir` | | `{{ sto_dir }}/build/working/sg-core` | See description of sgo_dir | +| `sg_bridge_dir` | | `{{ sto_dir }}/build/working/sg-bridge` | See description of sgo_dir | +| `prometheus_webhook_snmp_dir` | | `{{ sto_dir }}/build/working/prometheus-webhook-snmp` | See description of sgo_dir | +| `__service_telemetry_events_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_events_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the ElasticSearch CA certificate (minimum duration is 1h) | +| `__service_telemetry_events_enabled` | {true,false} | true | Whether to enable events support in ServiceTelemetry | +| `__service_telemetry_high_availability_enabled` | {true,false} | false | Whether to enable high availability support in ServiceTelemetry | +| `__service_telemetry_metrics_enabled` | {true,false} | true | Whether to enable metrics support in ServiceTelemetry | +| `__service_telemetry_storage_ephemeral_enabled` | {true,false} | false | Whether to enable ephemeral storage support in ServiceTelemetry | +| `__service_telemetry_storage_persistent_storage_class` | | | Set a custom storageClass to override the default provided by OpenShift platform | +| `__service_telemetry_snmptraps_enabled` | 
{true,false} | true | Whether to enable snmptraps delivery via Alertmanager receiver (prometheus-webhook-snmp) | +| `__service_telemetry_snmptraps_community` | | `public` | Set the SNMP community to send traps to. Defaults to public | +| `__service_telemetry_snmptraps_target` | | `192.168.24.254` | Set the SNMP target to send traps to. Defaults to 192.168.24.254 | +| `__service_telemetry_snmptraps_retries` | | 5 | Set the SNMP retry count for traps. Defaults to 5 | +| `__service_telemetry_snmptraps_port` | | 162 | Set the SNMP target port for traps. Defaults to 162 | +| `__service_telemetry_snmptraps_timeout` | | 1 | Set the SNMP retry timeout (in seconds). Defaults to 1 | +| `__service_telemetry_alert_oid_label` | | oid | The alert label name to look for oid value. Default to oid. | +| `__service_telemetry_trap_oid_prefix` | | 1.3.6.1.4.1.50495.15 | The OID prefix for trap variable bindings. | +| `__service_telemetry_trap_default_oid` | | 1.3.6.1.4.1.50495.15.1.2.1 | The trap OID if none is found in the Prometheus alert labels. | +| `__service_telemetry_trap_default_severity` | | | The trap severity if none is found in the Prometheus alert labels. | +| `__service_telemetry_observability_strategy` | | `use_redhat` | Which observability strategy to use for deployment. Default is 'use_redhat'. Also supported are 'use_hybrid', 'use_community', and 'none' | +| `__service_telemetry_transports_qdr_auth` | {'none', 'basic'} | `none` | Which auth method to use for QDR. Can be 'none' or 'basic'. Note: 'basic' is not yet supported in smoketests. 
| +| `__service_telemetry_transports_certificates_endpoint_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR endpoint certificate (minimum duration is 1h) | +| `__service_telemetry_transports_certificates_ca_cert_duration` | [ParseDuration](https://golang.org/pkg/time/#ParseDuration) | 70080h | Lifetime of the QDR CA certificate (minimum duration is 1h) | +| `__internal_registry_path` | | image-registry.openshift-image-registry.svc:5000 | Path to internal registry for image path | # Example Playbook diff --git a/build/stf-run-ci/defaults/main.yml b/build/stf-run-ci/defaults/main.yml index da9834ecf..a6c5ee184 100644 --- a/build/stf-run-ci/defaults/main.yml +++ b/build/stf-run-ci/defaults/main.yml @@ -67,5 +67,6 @@ prometheus_webhook_snmp_repository: https://github.com/infrawatch/prometheus-web base_dir: '' +clone_repos: true setup_bundle_registry_auth: true setup_bundle_registry_tls_ca: true diff --git a/build/stf-run-ci/tasks/clone_repos.yml b/build/stf-run-ci/tasks/clone_repos.yml index 2bb2871bf..632170676 100644 --- a/build/stf-run-ci/tasks/clone_repos.yml +++ b/build/stf-run-ci/tasks/clone_repos.yml @@ -3,60 +3,84 @@ # NOTE: since you can't loop against blocks (and we're using them for failure # # recovery when the request branch doesn't exist) we have to define each # of these separately rather than using a loop. 
+ +- name: Check if the {{ sgo_dir }} already exists + ansible.builtin.stat: + path: "{{ sgo_dir }}" + register: check_sgo_dir + - name: Get Smart Gateway Operator + when: not check_sgo_dir.stat.exists block: - name: Try cloning same-named branch or override branch from SGO repository ansible.builtin.git: repo: "{{ sgo_repository }}" - dest: "{{ base_dir }}/working/smart-gateway-operator" + dest: "{{ sgo_dir }}" version: "{{ sgo_branch | default(branch, true) }}" - force: true rescue: - name: "Get {{ version_branches.sgo }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/smart-gateway-operator - dest: "{{ base_dir }}/working/smart-gateway-operator" + dest: "{{ sgo_dir }}" version: "{{ version_branches.sgo }}" +- name: Check if the {{ sg_core_dir }} already exists + ansible.builtin.stat: + path: "{{ sg_core_dir }}" + register: check_sg_core_dir + - name: Get sg-core + when: not check_sg_core_dir.stat.exists block: - name: Try cloning same-named branch or override branch from sg-core repository ansible.builtin.git: repo: "{{ sg_core_repository }}" - dest: "{{ base_dir }}/working/sg-core" + dest: "{{ sg_core_dir }}" version: "{{ sg_core_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_core }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/sg-core - dest: "{{ base_dir }}/working/sg-core" + dest: "{{ sg_core_dir }}" version: "{{ version_branches.sg_core }}" +- name: Check if the {{ sg_bridge_dir }} already exists + ansible.builtin.stat: + path: "{{ sg_bridge_dir }}" + register: check_sg_bridge_dir + - name: Get sg-bridge + when: not check_sg_bridge_dir.stat.exists block: - name: Try cloning same-named branch or override branch from sg-bridge repository ansible.builtin.git: repo: "{{ sg_bridge_repository }}" - dest: "{{ base_dir }}/working/sg-bridge" + dest: "{{ sg_bridge_dir }}" 
version: "{{ sg_bridge_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.sg_bridge }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/sg-bridge - dest: "{{ base_dir }}/working/sg-bridge" + dest: "{{ sg_bridge_dir }}" version: "{{ version_branches.sg_bridge }}" +- name: Check if the {{ prometheus_webhook_snmp_dir }} already exists + ansible.builtin.stat: + path: "{{ prometheus_webhook_snmp_dir }}" + register: check_prometheus_webhook_snmp_dir + - name: Get prometheus-webhook-snmp + when: not check_prometheus_webhook_snmp_dir.stat.exists block: - name: Try cloning same-named branch or override branch from prometheus-webhook-snmp repository ansible.builtin.git: repo: "{{ prometheus_webhook_snmp_repository }}" - dest: "{{ base_dir }}/working/prometheus-webhook-snmp" + dest: "{{ prometheus_webhook_snmp_dir }}" version: "{{ prometheus_webhook_snmp_branch | default(branch, true) }}" rescue: - name: "Get {{ version_branches.prometheus_webhook_snmp }} upstream branch because specified branch or repository doesn't exist" ansible.builtin.git: repo: https://github.com/infrawatch/prometheus-webhook-snmp - dest: "{{ base_dir }}/working/prometheus-webhook-snmp" + dest: "{{ prometheus_webhook_snmp_dir }}" version: "{{ version_branches.prometheus_webhook_snmp }}" diff --git a/build/stf-run-ci/tasks/create_builds.yml b/build/stf-run-ci/tasks/create_builds.yml index e54b77cb9..8d287b121 100644 --- a/build/stf-run-ci/tasks/create_builds.yml +++ b/build/stf-run-ci/tasks/create_builds.yml @@ -23,7 +23,6 @@ - name: Kill first build since it will always fail (triggered on BuildConfig creation) ansible.builtin.shell: sleep 10 ; oc delete build {{ artifact.name }}-1 -n "{{ namespace }}" - ignore_errors: true retries: 3 delay: 10 register: kill_build @@ -34,7 +33,6 @@ ansible.builtin.command: oc start-build {{ artifact.name }} -n "{{ namespace }}" --follow --wait --from-dir "{{ 
artifact.working_build_dir }}" register: build_results when: build_lookup.resources | length == 0 - ignore_errors: true retries: 3 delay: 10 until: build_results.rc == 0 diff --git a/build/stf-run-ci/tasks/create_catalog.yml b/build/stf-run-ci/tasks/create_catalog.yml index 6eb6b49df..feed3b56f 100644 --- a/build/stf-run-ci/tasks/create_catalog.yml +++ b/build/stf-run-ci/tasks/create_catalog.yml @@ -38,9 +38,7 @@ register: index_dockercfg_secret ignore_errors: true -# There's an error when the requested resource doesn't exist, so check the rc -- when: index_dockercfg_secret.rc != 0 - block: +- block: - name: Create config.json to import as Secret ansible.builtin.template: variable_start_string: "<<" @@ -58,9 +56,32 @@ register: ose_op_registry_is ignore_errors: true -- name: Create ImageStream for ose-operator-registry - ansible.builtin.command: oc import-image -n {{ namespace }} ose-operator-registry:{{ default_operator_registry_image_tag }} --from={{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }} --confirm +- name: Set the operator_registry_image + ansible.builtin.set_fact: + operator_registry_image: "{{ default_operator_registry_image_base }}:{{ default_operator_registry_image_tag }}" + + # --show-multiarch=true is used because you get an error (and rc!=0) when you query a multi-arch image without specifying the arch, even when the image exists +- name: "Try to get the image info for the operator registry image" + ansible.builtin.command: + cmd: oc image info --show-multiarch=true "{{ operator_registry_image }}" + ignore_errors: true + register: image_info + +- name: Test alternative operator image + ansible.builtin.set_fact: + operator_registry_image: "quay.io/openshift/origin-operator-registry:4.13" + when: image_info.rc != 0 + +- name: Create ImageStream for ose-operator-registry, if it doesn't already exist + ansible.builtin.command: + cmd: | + oc import-image -n {{ namespace }} ose-operator-registry:{{ 
default_operator_registry_image_tag }} --from={{ operator_registry_image }} --confirm when: ose_op_registry_is.rc != 0 + register: create_ose_is + +- name: Show the image stream + ansible.builtin.debug: + var: create_ose_is - name: Delete the existing imagestream, if it exists ansible.builtin.command: oc delete imagestream -n {{ namespace }} service-telemetry-framework-index @@ -75,8 +96,16 @@ register: stf_index_imagestream ignore_errors: true -- when: stf_index_imagestream.rc != 0 - name: Create BuildConfig for service-telemetry-framework-index +- name: Show STF index image stream + ansible.builtin.debug: + var: stf_index_imagestream + +- name: Create index.yaml base for index image + ansible.builtin.template: + src: index-yaml.j2 + dest: "{{ base_dir }}/working/service-telemetry-framework-index/index.yaml" + +- name: Create BuildConfig for service-telemetry-framework-index kubernetes.core.k8s: definition: apiVersion: build.openshift.io/v1 @@ -102,7 +131,8 @@ dockerfile: | # The base image is expected to contain # /bin/opm (with a serve subcommand) and /bin/grpc_health_probe - FROM {{default_operator_registry_image_base}}:{{default_operator_registry_image_tag}} + + FROM {{ operator_registry_image }} COPY --chmod=666 index.yaml /configs/ @@ -121,7 +151,7 @@ dockerStrategy: from: kind: ImageStreamTag - name: "ose-operator-registry:{{default_operator_registry_image_tag}}" + name: "ose-operator-registry:{{ default_operator_registry_image_tag }}" volumes: - mounts: - destinationPath: /opt/app-root/auth @@ -134,24 +164,13 @@ type: Docker successfulBuildsHistoryLimit: 5 -- name: Get builds of service-telemetry-framework-index - kubernetes.core.k8s_info: - api_version: build.openshift.io/v1 - kind: Build - namespace: "{{ namespace }}" - label_selectors: - - "build=service-telemetry-framework-index" - register: index_builds - -- when: index_builds.resources | length == 0 - block: - - name: Create index.yaml base for index image - ansible.builtin.template: - src: 
index-yaml.j2 - dest: "{{ base_dir }}/working/service-telemetry-framework-index/index.yaml" - - - name: Build service-telemetry-framework-index - ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --from-dir {{ base_dir }}/working/service-telemetry-framework-index +- name: Build service-telemetry-framework-index + ansible.builtin.command: oc start-build -n "{{ namespace }}" service-telemetry-framework-index --wait --follow --from-dir {{ base_dir }}/working/service-telemetry-framework-index + register: build_result + ignore_errors: true + retries: 3 + delay: 10 + until: build_result.rc == 0 - name: Create CloudOps CatalogSource kubernetes.core.k8s: diff --git a/build/stf-run-ci/tasks/main.yml b/build/stf-run-ci/tasks/main.yml index cf2b0a880..3f4972cf8 100644 --- a/build/stf-run-ci/tasks/main.yml +++ b/build/stf-run-ci/tasks/main.yml @@ -68,6 +68,14 @@ base_dir: "{{ playbook_dir }}" when: base_dir | length == 0 +- name: Set the repo destination dirs, if not provided + ansible.builtin.set_fact: + sto_dir: "{{ sto_dir if sto_dir is defined else base_dir + '/..' 
}}" + sgo_dir: "{{ sgo_dir if sgo_dir is defined else base_dir + '/working/smart-gateway-operator' }}" + sg_core_dir: "{{ sg_core_dir if sg_core_dir is defined else base_dir + '/working/sg-core' }}" + sg_bridge_dir: "{{ sg_bridge_dir if sg_bridge_dir is defined else base_dir + '/working/sg-bridge'}}" + prometheus_webhook_snmp_dir: "{{ prometheus_webhook_snmp_dir if prometheus_webhook_snmp_dir is defined else base_dir + '/working/prometheus-webhook-snmp' }}" + - name: Get operator_sdk_v0 (build bundles) ansible.builtin.command: cmd: "./get_operator_sdk.sh {{ operator_sdk_v0 }}" @@ -97,6 +105,7 @@ - create_builds block: - name: Setup supporting repositories + when: clone_repos | bool ansible.builtin.include_tasks: clone_repos.yml tags: - clone @@ -104,11 +113,11 @@ - name: Create base build list ansible.builtin.set_fact: build_list: - - {name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: "{{ base_dir }}/../"} - - {name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: "{{ base_dir }}/working/smart-gateway-operator"} - - {name: sg-core, dockerfile_path: build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: "{{ base_dir }}/working/sg-core"} - - {name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: "{{ base_dir }}/working/sg-bridge"} - - {name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: "{{ base_dir }}/working/prometheus-webhook-snmp"} + - {name: service-telemetry-operator, dockerfile_path: build/Dockerfile, image_reference_name: sto_image_path, working_build_dir: "{{ sto_dir }}"} + - {name: smart-gateway-operator, dockerfile_path: build/Dockerfile, image_reference_name: sgo_image_path, working_build_dir: "{{ sgo_dir }}"} + - {name: sg-core, dockerfile_path: 
build/Dockerfile, image_reference_name: sg_core_image_path, working_build_dir: "{{ sg_core_dir }}"} + - {name: sg-bridge, dockerfile_path: build/Dockerfile, image_reference_name: sg_bridge_image_path, working_build_dir: "{{ sg_bridge_dir }}"} + - {name: prometheus-webhook-snmp, dockerfile_path: Dockerfile, image_reference_name: prometheus_webhook_snmp_image_path, working_build_dir: "{{ prometheus_webhook_snmp_dir }}"} - ansible.builtin.debug: var: build_list @@ -206,11 +215,12 @@ - name: Validate system is operational ansible.builtin.shell: | OCP_PROJECT="{{ namespace }}" VALIDATION_SCOPE="{{ __service_telemetry_observability_strategy }}" timeout 1200 "{{ base_dir }}/validate_deployment.sh" >> {{ logfile_dir }}/validate_deployment.log 2>&1 + cat {{ logfile_dir }}/validate_deployment.log args: executable: /bin/bash register: validate_deployment + failed_when: validate_deployment.stdout_lines[-1] != "* [info] CI Build complete. You can now run tests." - name: Show the result of the validate_deployment script - ansible.builtin.shell: - cmd: | - cat {{ logfile_dir }}/validate_deployment.log + ansible.builtin.debug: + var: validate_deployment.stdout_lines[-1] diff --git a/build/stf-run-ci/tasks/pre-clean.yml b/build/stf-run-ci/tasks/pre-clean.yml index 712d188bf..87e649ddd 100644 --- a/build/stf-run-ci/tasks/pre-clean.yml +++ b/build/stf-run-ci/tasks/pre-clean.yml @@ -135,6 +135,8 @@ - openshift-cert-manager-operator - cert-manager-operator - cert-manager + retries: 3 + delay: 10 - name: Remove Elasticsearch ignore_errors: true diff --git a/build/stf-run-ci/tasks/preflight_checks.yml b/build/stf-run-ci/tasks/preflight_checks.yml index 870931789..066474572 100644 --- a/build/stf-run-ci/tasks/preflight_checks.yml +++ b/build/stf-run-ci/tasks/preflight_checks.yml @@ -19,7 +19,12 @@ ansible.builtin.command: cmd: | oc describe csv $(oc get csv | grep "service-telemetry-operator" | awk '{print $1}') > {{ logfile_dir }}/oc_get_csv_sto.log 2>&1 - cat {{ logfile_dir }} + cat {{ 
logfile_dir }}/oc_get_csv_sto.log + register: csv_sto + + - name: "Show service-telemetry-operator CSV information" + ansible.builtin.debug: + var: csv_sto.stdout - name: "Show service-telemetry-operator CSV information" ansible.builtin.debug: diff --git a/build/stf-run-ci/tasks/setup_base.yml b/build/stf-run-ci/tasks/setup_base.yml index 6f60de71f..bb6667184 100644 --- a/build/stf-run-ci/tasks/setup_base.yml +++ b/build/stf-run-ci/tasks/setup_base.yml @@ -38,13 +38,13 @@ metadata: labels: operators.coreos.com/observability-operator.openshift-operators: "" - name: observability-operator + name: cluster-observability-operator namespace: openshift-operators spec: - channel: stable + channel: development installPlanApproval: Automatic - name: observability-operator - source: community-operators + name: cluster-observability-operator + source: redhat-operators sourceNamespace: openshift-marketplace when: - __service_telemetry_observability_strategy in ['use_redhat', 'use_hybrid'] diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index 8439dce4a..f549fc209 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -64,7 +64,7 @@ data: cert.pem: "{{ lookup('file', 'CA.pem') | b64encode }}" -- when: setup_bundle_registry_tls_ca +- when: setup_bundle_registry_tls_ca | bool name: Patch the default service account to use our pull secret kubernetes.core.k8s_json_patch: kind: ServiceAccount @@ -90,10 +90,10 @@ - name: "Ensure that the bundle paths are set." 
ansible.builtin.assert: that: - - '__smart_gateway_bundle_image_path is defined and __smart_gateway_bundle_image_path != None' - - '__service_telemetry_bundle_image_path is defined and __service_telemetry_bundle_image_path != None' + - '__smart_gateway_bundle_image_path | default("") | length > 0' + - '__service_telemetry_bundle_image_path | default("") | length > 0' fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." - success_msg: "Bundle paths are defined and not None" + success_msg: "Bundle paths are defined, are not None and have a non-zero-length" - name: Deploy SGO via OLM bundle ansible.builtin.shell: diff --git a/build/stf-run-ci/tasks/setup_stf_local_build.yml b/build/stf-run-ci/tasks/setup_stf_local_build.yml index 40774223f..ffe2e7f6a 100644 --- a/build/stf-run-ci/tasks/setup_stf_local_build.yml +++ b/build/stf-run-ci/tasks/setup_stf_local_build.yml @@ -6,7 +6,7 @@ - block: - name: Generate Smart Gateway Operator CSV ansible.builtin.shell: - chdir: "{{ base_dir }}/working/smart-gateway-operator/build" + chdir: "{{ sgo_dir }}/build" cmd: | LOGFILE="{{ logfile_dir }}/sgo_gen_bundle.log" \ OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ @@ -28,7 +28,7 @@ - name: Replace namespace in SGO role binding ansible.builtin.replace: - path: "{{ base_dir }}/working/smart-gateway-operator/deploy/role_binding.yaml" + path: "{{ sgo_dir }}/deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' @@ -42,7 +42,8 @@ block: - name: Load Smart Gateway Operator RBAC ansible.builtin.command: - cmd: oc apply -f {{ base_dir }}/working/smart-gateway-operator/deploy/{{ item }} -n "{{ namespace }}" + cmd: oc apply -f ./deploy/{{ item }} -n "{{ namespace }}" + chdir: "{{ sgo_dir }}" loop: - service_account.yaml - role.yaml @@ -57,7 +58,7 @@ - block: - name: 
Generate Service Telemetry Operator CSV ansible.builtin.shell: - chdir: "{{ base_dir }}" + chdir: "{{ sto_dir }}/build" cmd: | LOGFILE="{{ logfile_dir }}/sto_gen_bundle.log" \ OPERATOR_SDK="{{ base_dir }}/working/operator-sdk-{{ operator_sdk_v0 }}" \ @@ -76,7 +77,7 @@ - name: Replace namespace in STO role binding ansible.builtin.replace: - path: "{{ base_dir }}/../deploy/role_binding.yaml" + path: "{{ sto_dir }}/deploy/role_binding.yaml" regexp: 'placeholder' replace: '{{ namespace }}' @@ -90,8 +91,8 @@ block: - name: Load Service Telemetry Operator RBAC ansible.builtin.command: - cmd: oc apply -f ../deploy/{{ item }} -n "{{ namespace }}" - chdir: "{{ base_dir }}" + cmd: oc apply -f ./deploy/{{ item }} -n "{{ namespace }}" + chdir: "{{ sto_dir }}" loop: - service_account.yaml - role.yaml @@ -105,5 +106,5 @@ # cleanup - name: Revert local change to role_binding.yaml ansible.builtin.shell: - cmd: git checkout -- "{{ base_dir }}/../deploy/role_binding.yaml" - chdir: "{{ base_dir }}" + cmd: git checkout -- deploy/role_binding.yaml + chdir: "{{ sto_dir }}" diff --git a/ci/common-tasks.yml b/ci/common-tasks.yml new file mode 100644 index 000000000..40ff4ad1e --- /dev/null +++ b/ci/common-tasks.yml @@ -0,0 +1,13 @@ +--- +- name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' + when: sto_dir | default('') | length == 0 + +- name: "Get vars common to all jobs" + ansible.builtin.include_vars: + file: "vars-zuul-common.yml" + +- name: "Get scenario-specific vars" + ansible.builtin.include_vars: + file: "vars-{{ scenario }}.yml" diff --git a/ci/deploy_stf.yml b/ci/deploy_stf.yml index 170e8590a..b90683f6a 100644 --- a/ci/deploy_stf.yml +++ b/ci/deploy_stf.yml @@ -2,18 +2,8 @@ - name: "Deploy STF" hosts: controller tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ 
zuul.project.src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Log into the cluster" ansible.builtin.import_role: diff --git a/ci/post-collect_logs.yml b/ci/post-collect_logs.yml index 58552b618..c37b512df 100644 --- a/ci/post-collect_logs.yml +++ b/ci/post-collect_logs.yml @@ -14,18 +14,8 @@ name: Collect logs on the controller gather_facts: false tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Create log dir" ansible.builtin.file: @@ -42,30 +32,11 @@ ansible.builtin.import_role: name: '../build/stf-collect-logs' - - name: "Get pods and describe non-completed, non-running pods" - ansible.builtin.shell: - cmd: | - echo "*** oc get pods ***" > {{ logfile_dir }}/oc_get_pods.log 2>&1 - oc -n {{ namespace }} get pods >> {{ logfile_dir }}/oc_get_pods.log 2>&1 - - for pod in $(oc get pods | grep -v NAME | grep -v Running | awk '{ print $1 }'); - do - oc -n {{ namespace }} describe pod $pod > {{ logfile_dir }}/post_oc_describe_pod_${pod}.log 2>&1 - done - ignore_errors: true - retries: 3 - delay: 10 - - - name: "Get build details" - ansible.builtin.shell: - cmd: | - for build in $(oc -n {{ namespace }} get builds -o json| jq -r '.items[].metadata.name'); do oc -n {{ namespace }} describe build $build > {{ logfile_dir 
}}/post_oc_describe_build_${build}.log 2>&1; done - - name: "Copy generated logs" ansible.builtin.shell: | cp {{ ansible_env.HOME }}/*.log . args: - chdir: "{{ ansible_user_dir }}/zuul-output/logs/controller" + chdir: "{{ logfile_dir }}" changed_when: true ignore_errors: true diff --git a/ci/pre-2node.yml b/ci/pre-2node.yml new file mode 100644 index 000000000..1d44b5f40 --- /dev/null +++ b/ci/pre-2node.yml @@ -0,0 +1,33 @@ +--- +- name: "Do pre-work to get kubeconfig" + hosts: controller + vars: + ci_framework_dir: "{{ ansible_user_dir }}/{{ zuul.projects['github.com/openstack-k8s-operators/ci-framework'].src_dir }}" + environment: + PATH: "~/.crc/bin:~/.crc/bin/oc:~/bin:{{ ansible_env.PATH }}" + tasks: + - name: "Set the sto_dir if it isn't already set" + ansible.builtin.set_fact: + sto_dir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/infrawatch/service-telemetry-operator"].src_dir }}' + when: sto_dir | default('') | length == 0 + + - name: "Run bootstrap playbook" + ansible.builtin.shell: + cmd: | + ansible-playbook -e@{{ ansible_user_dir }}/ci-framework-data/artifacts/parameters/zuul-params.yml {{ ci_framework_dir }}/playbooks/01-bootstrap.yml + chdir: "{{ ci_framework_dir }}" + + - name: Run ci_framework infra playbook + ansible.builtin.shell: + cmd: | + ansible-playbook -e cifmw_use_opn=false -e cifmw_use_devscripts=false -e cifmw_basedir={{ ansible_user_dir }}/ci-framework-data/ -e cifmw_openshift_setup_skip_internal_registry_tls_verify=true playbooks/02-infra.yml + chdir: "{{ ci_framework_dir }}" + + - name: Run make targets for setup + community.general.make: + chdir: '{{ ansible_env.HOME }}/{{ zuul.projects["github.com/openstack-k8s-operators/ci-framework"].src_dir }}' + target: "{{ item }}" + with_items: + - setup_tests + - setup_molecule + diff --git a/ci/prepare.yml b/ci/prepare.yml index 7b65362d6..9557d34b1 100644 --- a/ci/prepare.yml +++ b/ci/prepare.yml @@ -2,18 +2,8 @@ - name: "Prepare the environment for running stf" hosts: controller 
tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Update pip" ansible.builtin.pip: @@ -23,7 +13,7 @@ - name: "Install pre-reqs from pip" ansible.builtin.pip: - requirements: "build/stf-run-ci/requirements.txt" + requirements: "{{ sto_dir }}/build/stf-run-ci/requirements.txt" chdir: "{{ sto_dir }}" state: present @@ -33,7 +23,7 @@ name: "{{ item }}" with_items: - "kubernetes.core:2.3.2" - - "community.general:6.2.0" + - "community.general" - name: "Log into the cluster" ansible.builtin.import_role: diff --git a/ci/test_stf.yml b/ci/test_stf.yml index 7f196e860..493775a3f 100644 --- a/ci/test_stf.yml +++ b/ci/test_stf.yml @@ -2,18 +2,8 @@ - name: "Run tests to verify that STF runs as expected" hosts: controller tasks: - - name: "Set the sto_dir if it isn't already set" - ansible.builtin.set_fact: - sto_dir: '{{ ansible_env.HOME }}/{{ zuul.project.src_dir }}' - when: sto_dir | default('') | length == 0 - - - name: "Get vars common to all jobs" - ansible.builtin.include_vars: - file: "vars-zuul-common.yml" - - - name: "Get scenario-specific vars" - ansible.builtin.include_vars: - file: "vars-{{ scenario }}.yml" + - name: "Setup play vars" + ansible.builtin.include_tasks: "common-tasks.yml" - name: "Log into the cluster" ansible.builtin.import_role: diff --git a/ci/vars-local_build-index_deploy.yml b/ci/vars-local_build-index_deploy.yml new file mode 100644 index 000000000..ed9acc624 --- /dev/null +++ b/ci/vars-local_build-index_deploy.yml @@ -0,0 +1,5 @@ +--- +#ansible-playbook --extra-vars __local_build_enabled=true -e 
__deploy_from_index_enabled=true --extra-vars working_branch="$(git rev-parse --abbrev-ref HEAD)" --extra-vars __service_telemetry_observability_strategy=use_redhat ./run-ci.yaml +__local_build_enabled: true +__deploy_from_index_enabled: true +__service_telemetry_observability_strategy: use_redhat diff --git a/ci/vars-local_build.yml b/ci/vars-local_build.yml index 3126605a4..206e2b327 100644 --- a/ci/vars-local_build.yml +++ b/ci/vars-local_build.yml @@ -2,4 +2,3 @@ __deploy_stf: true __local_build_enabled: true __service_telemetry_snmptraps_enabled: true -__service_telemetry_storage_ephemeral_enabled: true diff --git a/ci/vars-nightly_bundles.yml b/ci/vars-nightly_bundles.yml index ca49656f3..26572fe9c 100644 --- a/ci/vars-nightly_bundles.yml +++ b/ci/vars-nightly_bundles.yml @@ -4,4 +4,3 @@ __local_build_enabled: false __deploy_from_bundles_enabled: true -__service_telemetry_storage_ephemeral_enabled: true diff --git a/ci/vars-zuul-common.yml b/ci/vars-zuul-common.yml index 39d43a29d..de0f17613 100644 --- a/ci/vars-zuul-common.yml +++ b/ci/vars-zuul-common.yml @@ -4,3 +4,9 @@ setup_bundle_registry_tls_ca: false setup_bundle_registry_auth: false base_dir: "{{ sto_dir }}/build" logfile_dir: "{{ ansible_user_dir }}/zuul-output/logs/controller" +clone_repos: false +sgo_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/smart-gateway-operator'].src_dir }}" +sg_core_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/sg-core'].src_dir }}" +sg_bridge_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/sg-bridge'].src_dir }}" +prometheus_webhook_snmp_dir: "{{ ansible_env.HOME }}/{{ zuul.projects['github.com/infrawatch/prometheus-webhook-snmp'].src_dir }}" +__service_telemetry_storage_persistent_storage_class: "crc-csi-hostpath-provisioner" \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/.gitignore b/collections/ansible_collections/operator_sdk/util/.gitignore deleted file mode 
100644 index dd4b63d94..000000000 --- a/collections/ansible_collections/operator_sdk/util/.gitignore +++ /dev/null @@ -1 +0,0 @@ -operator_sdk-util-*.tar.gz diff --git a/collections/ansible_collections/operator_sdk/util/FILES.json b/collections/ansible_collections/operator_sdk/util/FILES.json deleted file mode 100644 index df11b5cf5..000000000 --- a/collections/ansible_collections/operator_sdk/util/FILES.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "files": [ - { - "name": ".", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": ".gitignore", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "1e87175a024a4cf4bf7b3a5fa623d046351a591d5cee549d8ca8c30ec669a013", - "format": 1 - }, - { - "name": "plugins", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": "plugins/modules", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": null, - "format": 1 - }, - { - "name": "plugins/modules/requeue_after.py", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "16da24bb189ab5b48a2071ae21449bbd4ee332787ed62c24c094acbe64e7248e", - "format": 1 - }, - { - "name": "plugins/modules/k8s_status.py", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "fabd8a42babf96433569e4ff9887c9a163d7adab68df94d4712f7d6dbb8c1030", - "format": 1 - }, - { - "name": "plugins/README.md", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "58b5a167904c91786df167dd097ab76aed73ffba6cc746a3624c2a5bbf62ef6f", - "format": 1 - }, - { - "name": "LICENSE", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "c71d239df91726fc519c6eb72d318ec65820627232b2f796219e87dcf35d0ab4", - "format": 1 - }, - { - "name": "README.md", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "2d8e64d77e0a8202ce2ec6dd36b30df06840e41ab9ade86b3a55908181d322b2", - "format": 1 - }, - { - "name": "demo", - "ftype": "dir", - "chksum_type": null, - "chksum_sha256": 
null, - "format": 1 - }, - { - "name": "demo/playbook.yml", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "fe562128e0c234462e315568b7ef38657d601976c74a87d903c31d8ddc4ff907", - "format": 1 - }, - { - "name": "demo/README.md", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "5b6b9137f90122c4fac758a5089cd01bfdc7cf007a379e21ceb3ff1f82aafd55", - "format": 1 - } - ], - "format": 1 -} \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/LICENSE b/collections/ansible_collections/operator_sdk/util/LICENSE deleted file mode 100644 index 261eeb9e9..000000000 --- a/collections/ansible_collections/operator_sdk/util/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/collections/ansible_collections/operator_sdk/util/MANIFEST.json b/collections/ansible_collections/operator_sdk/util/MANIFEST.json deleted file mode 100644 index 6a4b9836c..000000000 --- a/collections/ansible_collections/operator_sdk/util/MANIFEST.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "collection_info": { - "namespace": "operator_sdk", - "name": "util", - "version": "0.1.0", - "authors": [ - "Austin Macdonald ", - "Fabian von Feilitzsch ", - "Venkat Ramaraju " - ], - "readme": "README.md", - "tags": [ - "operator_sdk", - "kubernetes", - "k8s", - "k8s_status", - "ansible_operator" - ], - "description": "This is a collection of Ansible assets used by the Operator SDK. 
https://github.com/operator-framework/operator-sdk", - "license": [], - "license_file": "LICENSE", - "dependencies": {}, - "repository": "https://github.com/operator-framework/operator-sdk-ansible-util", - "documentation": "https://github.com/operator-framework/operator-sdk-ansible-util", - "homepage": "https://github.com/operator-framework/operator-sdk-ansible-util", - "issues": "https://github.com/operator-framework/operator-sdk-ansible-util/issues" - }, - "file_manifest_file": { - "name": "FILES.json", - "ftype": "file", - "chksum_type": "sha256", - "chksum_sha256": "d7240f7df82fd9bfe60801b40703ad185932cba629271a99aad3657406c81eb0", - "format": 1 - }, - "format": 1 -} \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/README.md b/collections/ansible_collections/operator_sdk/util/README.md deleted file mode 100644 index f4140831c..000000000 --- a/collections/ansible_collections/operator_sdk/util/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Ansible Collection - operator_sdk.util - -A collection of Ansible assets for use with Ansible-based operators -built with the [operator-sdk](https://github.com/operator-framework/operator-sdk/). - - https://galaxy.ansible.com/operator_sdk/util - - -## Installation - -#### From Galaxy - -``` -ansible-galaxy collection install operator_sdk.util -``` - -#### Local - -``` -ansible-galaxy collection install operator_sdk-util-0.0.1.tar.gz -p ~/.ansible/collections -``` - -## Developer Docs - -### Build and Publish Collection - -Before building the collection, edit `galaxy.yml` and update the -version. 
- -**Build the collection:** - -``` -$ ansible-galaxy collection build -``` - -**Publish the collection:** - -``` -ansible-galaxy collection publish operator_sdk-util-0.0.0.tar.gz --api-key=$GALAXY_API_KEY -``` diff --git a/collections/ansible_collections/operator_sdk/util/demo/README.md b/collections/ansible_collections/operator_sdk/util/demo/README.md deleted file mode 100644 index 5ccc732e5..000000000 --- a/collections/ansible_collections/operator_sdk/util/demo/README.md +++ /dev/null @@ -1,26 +0,0 @@ -ansible-playbook -i localhost demo/playbook.yml - -```yaml -$ kubectl get memcacheds example-memcached -o yaml - -apiVersion: cache.example.com/v1alpha1 -kind: Memcached - name: example-memcached - namespace: default - selfLink: /apis/cache.example.com/v1alpha1/namespaces/default/memcacheds/example-memcached - uid: 2a94ff2b-84e0-40ce-8b5e-2b7e4d2bc0e2 -status: - conditions: - - ansibleResult: - changed: 0 - completion: 2019-10-16T13:23:21.64021 - failures: 0 - ok: 3 - skipped: 0 - lastTransitionTime: "2019-10-15T13:26:58Z" - message: Awaiting next reconciliation - reason: Successful - status: "True" - type: Running - diditwork: why yes it did -``` diff --git a/collections/ansible_collections/operator_sdk/util/demo/playbook.yml b/collections/ansible_collections/operator_sdk/util/demo/playbook.yml deleted file mode 100644 index 8127ec5de..000000000 --- a/collections/ansible_collections/operator_sdk/util/demo/playbook.yml +++ /dev/null @@ -1,15 +0,0 @@ -- hosts: localhost - # Syntax option 1 - collections: - - operator_sdk.util - tasks: - - k8s_status: - # Syntax option 2 - # tasks: - # - operator_sdk.util.k8s_status: - api_version: cache.example.com/v1alpha1 - kind: Memcached - name: example-memcached - namespace: default - status: - diditwork: "yes it did" diff --git a/collections/ansible_collections/operator_sdk/util/plugins/README.md b/collections/ansible_collections/operator_sdk/util/plugins/README.md deleted file mode 100644 index 7e9e2f1fc..000000000 --- 
a/collections/ansible_collections/operator_sdk/util/plugins/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Collections Plugins Directory - -This directory can be used to ship various plugins inside an Ansible collection. Each plugin is placed in a folder that -is named after the type of plugin it is in. It can also include the `module_utils` and `modules` directory that -would contain module utils and modules respectively. - -Here is an example directory of the majority of plugins currently supported by Ansible: - -``` -└── plugins - ├── action - ├── become - ├── cache - ├── callback - ├── cliconf - ├── connection - ├── filter - ├── httpapi - ├── inventory - ├── lookup - ├── module_utils - ├── modules - ├── netconf - ├── shell - ├── strategy - ├── terminal - ├── test - └── vars -``` - -A full list of plugin types can be found at [Working With Plugins](https://docs.ansible.com/ansible/devel/plugins/plugins.html). \ No newline at end of file diff --git a/collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py b/collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py deleted file mode 100644 index ed7827db7..000000000 --- a/collections/ansible_collections/operator_sdk/util/plugins/modules/k8s_status.py +++ /dev/null @@ -1,404 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- - -from __future__ import absolute_import, division, print_function - -import re -import copy - -from ansible.module_utils.k8s.common import AUTH_ARG_SPEC, COMMON_ARG_SPEC, KubernetesAnsibleModule - -try: - from openshift.dynamic.exceptions import DynamicApiError -except ImportError as exc: - class KubernetesException(Exception): - pass - - -__metaclass__ = type - -ANSIBLE_METADATA = {'metadata_version': '1.1', - 'status': ['preview'], - 'supported_by': 'community'} - -DOCUMENTATION = ''' - -module: k8s_status - -short_description: Update the status for a Kubernetes API resource - -version_added: "2.7" - -author: "Fabian von Feilitzsch (@fabianvf)" - 
-description: - - Sets the status field on a Kubernetes API resource. Only should be used if you are using Ansible to - implement a controller for the resource being modified. - -options: - status: - type: dict - description: - - A object containing `key: value` pairs that will be set on the status object of the specified resource. - - One of I(status) or I(conditions) is required. - conditions: - type: list - description: - - A list of condition objects that will be set on the status.conditions field of the specified resource. - - Unless I(force) is C(true) the specified conditions will be merged with the conditions already set on the status field of the specified resource. - - Each element in the list will be validated according to the conventions specified in the - [Kubernetes API conventions document](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status). - - 'The fields supported for each condition are: - `type` (required), - `status` (required, one of "True", "False", "Unknown"), - `reason` (single CamelCase word), - `message`, - `lastHeartbeatTime` (RFC3339 datetime string), and - `lastTransitionTime` (RFC3339 datetime string).' - - One of I(status) or I(conditions) is required.' - api_version: - description: - - Use to specify the API version. Use in conjunction with I(kind), I(name), and I(namespace) to identify a - specific object. - required: yes - aliases: - - api - - version - kind: - description: - - Use to specify an object model. Use in conjunction with I(api_version), I(name), and I(namespace) to identify a - specific object. - required: yes - name: - description: - - Use to specify an object name. Use in conjunction with I(api_version), I(kind) and I(namespace) to identify a - specific object. - required: yes - namespace: - description: - - Use to specify an object namespace. Use in conjunction with I(api_version), I(kind), and I(name) - to identify a specific object. 
- force: - description: - - If set to C(True), the status will be set using `PUT` rather than `PATCH`, replacing the full status object. - default: false - type: bool - host: - description: - - Provide a URL for accessing the API. Can also be specified via K8S_AUTH_HOST environment variable. - api_key: - description: - - Token used to authenticate with the API. Can also be specified via K8S_AUTH_API_KEY environment variable. - kubeconfig: - description: - - Path to an instance Kubernetes config file. If not provided, and no other connection - options are provided, the openshift client will attempt to load the default - configuration file from I(~/.kube/config.json). Can also be specified via K8S_AUTH_KUBECONFIG environment - variable. - context: - description: - - The name of a context found in the config file. Can also be specified via K8S_AUTH_CONTEXT environment variable. - username: - description: - - Provide a username for authenticating with the API. Can also be specified via K8S_AUTH_USERNAME environment - variable. - password: - description: - - Provide a password for authenticating with the API. Can also be specified via K8S_AUTH_PASSWORD environment - variable. - cert_file: - description: - - Path to a certificate used to authenticate with the API. Can also be specified via K8S_AUTH_CERT_FILE environment - variable. - key_file: - description: - - Path to a key file used to authenticate with the API. Can also be specified via K8S_AUTH_KEY_FILE environment - variable. - ssl_ca_cert: - description: - - Path to a CA certificate used to authenticate with the API. Can also be specified via K8S_AUTH_SSL_CA_CERT - environment variable. - verify_ssl: - description: - - "Whether or not to verify the API server's SSL certificates. Can also be specified via K8S_AUTH_VERIFY_SSL - environment variable." 
- type: bool - -requirements: - - "python >= 3.7" - - "openshift >= 0.8.1" - - "PyYAML >= 3.11" -''' - -EXAMPLES = ''' -- name: Set custom status fields on TestCR - k8s_status: - api_version: apps.example.com/v1alpha1 - kind: TestCR - name: my-test - namespace: testing - status: - hello: world - custom: entries - -- name: Update the standard condition of an Ansible Operator - k8s_status: - api_version: apps.example.com/v1alpha1 - kind: TestCR - name: my-test - namespace: testing - conditions: - - type: Running - status: "True" - reason: MigrationStarted - message: "Migration from v2 to v3 has begun" - lastTransitionTime: "{{ ansible_date_time.iso8601 }}" - -- name: | - Create custom conditions. WARNING: The default Ansible Operator status management - will never overwrite custom conditions, so they will persist indefinitely. If you - want the values to change or be removed, you will need to clean them up manually. - k8s_status: - conditions: - - type: Available - status: "False" - reason: PingFailed - message: "The service did not respond to a ping" - -''' - -RETURN = ''' -result: - description: - - If a change was made, will return the patched object, otherwise returns the instance object. - returned: success - type: complex - contains: - api_version: - description: The versioned schema of this representation of an object. - returned: success - type: str - kind: - description: Represents the REST resource this object represents. - returned: success - type: str - metadata: - description: Standard object metadata. Includes name, namespace, annotations, labels, etc. - returned: success - type: complex - spec: - description: Specific attributes of the object. Will vary based on the I(api_version) and I(kind). - returned: success - type: complex - status: - description: Current status details for the object. 
- returned: success - type: complex -''' - - -def condition_array(conditions): - - VALID_KEYS = ['type', 'status', 'reason', 'message', 'lastHeartbeatTime', 'lastTransitionTime'] - REQUIRED = ['type', 'status'] - CAMEL_CASE = re.compile(r'^(?:[A-Z]*[a-z]*)+$') - RFC3339_datetime = re.compile(r'^\d{4}-\d\d-\d\dT\d\d:\d\d(:\d\d)?(\.\d+)?(([+-]\d\d:\d\d)|Z)$') - - def validate_condition(condition): - if not isinstance(condition, dict): - raise ValueError('`conditions` must be a list of objects') - if isinstance(condition.get('status'), bool): - condition['status'] = 'True' if condition['status'] else 'False' - - for key in condition.keys(): - if key not in VALID_KEYS: - raise ValueError('{} is not a valid field for a condition, accepted fields are {}'.format(key, VALID_KEYS)) - for key in REQUIRED: - if not condition.get(key): - raise ValueError('Condition `{}` must be set'.format(key)) - - if condition['status'] not in ['True', 'False', 'Unknown']: - raise ValueError('Condition `status` must be one of ["True", "False", "Unknown"], not {}'.format(condition['status'])) - - if condition.get('reason') and not re.match(CAMEL_CASE, condition['reason']): - raise ValueError('Condition `reason` must be a single, CamelCase word') - - for key in ['lastHeartBeatTime', 'lastTransitionTime']: - if condition.get(key) and not re.match(RFC3339_datetime, condition[key]): - raise ValueError('`{}` must be a RFC3339 compliant datetime string'.format(key)) - - return condition - - return [validate_condition(c) for c in conditions] - - -STATUS_ARG_SPEC = { - 'status': { - 'type': 'dict', - 'required': False - }, - 'conditions': { - 'type': condition_array, - 'required': False - } -} - - -def main(): - KubernetesAnsibleStatusModule().execute_module() - - -class KubernetesAnsibleStatusModule(KubernetesAnsibleModule): - - def __init__(self, *args, **kwargs): - KubernetesAnsibleModule.__init__( - self, *args, - supports_check_mode=True, - **kwargs - ) - self.kind = self.params.get('kind') - 
self.api_version = self.params.get('api_version') - self.name = self.params.get('name') - self.namespace = self.params.get('namespace') - self.force = self.params.get('force') - - self.status = self.params.get('status') or {} - self.conditions = self.params.get('conditions') or [] - - if self.conditions and self.status and self.status.get('conditions'): - raise ValueError("You cannot specify conditions in both the `status` and `conditions` parameters") - - if self.conditions: - self.status['conditions'] = self.conditions - - def execute_module(self): - self.client = self.get_api_client() - - resource = self.find_resource(self.kind, self.api_version, fail=True) - if 'status' not in resource.subresources: - self.fail_json(msg='Resource {}.{} does not support the status subresource'.format(resource.api_version, resource.kind)) - - try: - instance = resource.get(name=self.name, namespace=self.namespace).to_dict() - except DynamicApiError as exc: - self.fail_json(msg='Failed to retrieve requested object: {0}'.format(exc), - error=exc.summary()) - # Make sure status is at least initialized to an empty dict - instance['status'] = instance.get('status', {}) - - if self.force: - self.exit_json(**self.replace(resource, instance)) - else: - self.exit_json(**self.patch(resource, instance)) - - def replace(self, resource, instance): - if self.status == instance['status']: - return {'result': instance, 'changed': False} - instance['status'] = self.status - try: - result = resource.status.replace(body=instance).to_dict(), - except DynamicApiError as exc: - self.fail_json(msg='Failed to replace status: {}'.format(exc), error=exc.summary()) - - return { - 'result': result, - 'changed': True - } - - def clean_last_transition_time(self, status): - '''clean_last_transition_time removes lastTransitionTime attribute from each status.conditions[*] (from old conditions). - It returns copy of status with updated conditions. 
Copy of status is returned, because if new conditions - are subset of old conditions, then module would return conditions without lastTransitionTime. Updated status - should be used only for check in object_contains function, not for next updates, because otherwise it can create - a mess with lastTransitionTime attribute. - - If new conditions don't contain lastTransitionTime and they are different from old conditions - (e.g. they have different status), conditions are updated and kubernetes should sets lastTransitionTime - field during update. If new conditions contain lastTransitionTime, then conditions are updated. - - Parameters: - status (dict): dictionary, which contains conditions list - - Returns: - dict: copy of status with updated conditions - ''' - updated_old_status = copy.deepcopy(status) - - for item in updated_old_status.get('conditions', []): - if 'lastTransitionTime' in item: - del item['lastTransitionTime'] - - return updated_old_status - - def patch(self, resource, instance): - # Remove lastTransitionTime from status.conditions[*] and use updated_old_status only for check in object_contains function. - # Updates of conditions should be done only with original data not with updated_old_status. 
- updated_old_status = self.clean_last_transition_time(instance['status']) - if self.object_contains(updated_old_status, self.status): - return {'result': instance, 'changed': False} - instance['status'] = self.merge_status(instance['status'], self.status) - try: - result = resource.status.patch(body=instance, content_type='application/merge-patch+json').to_dict() - except DynamicApiError as exc: - self.fail_json(msg='Failed to replace status: {}'.format(exc), error=exc.summary()) - - return { - 'result': result, - 'changed': True - } - - def merge_status(self, old, new): - old_conditions = old.get('conditions', []) - new_conditions = new.get('conditions', []) - if not (old_conditions and new_conditions): - return new - - merged = copy.deepcopy(old_conditions) - - for condition in new_conditions: - idx = self.get_condition_idx(merged, condition['type']) - if idx is not None: - merged[idx] = condition - else: - merged.append(condition) - new['conditions'] = merged - return new - - def get_condition_idx(self, conditions, name): - for i, condition in enumerate(conditions): - if condition.get('type') == name: - return i - return None - - def object_contains(self, obj, subset): - def dict_is_subset(obj, subset): - return all([mapping.get(type(obj.get(k)), mapping['default'])(obj.get(k), v) for (k, v) in subset.items()]) - - def list_is_subset(obj, subset): - return all(item in obj for item in subset) - - def values_match(obj, subset): - return obj == subset - - mapping = { - dict: dict_is_subset, - list: list_is_subset, - tuple: list_is_subset, - 'default': values_match - } - - return dict_is_subset(obj, subset) - - @property - def argspec(self): - args = copy.deepcopy(COMMON_ARG_SPEC) - args.pop('state') - args.pop('resource_definition') - args.pop('src') - args.update(AUTH_ARG_SPEC) - args.update(STATUS_ARG_SPEC) - return args - - -if __name__ == '__main__': - main() diff --git a/collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py 
b/collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py deleted file mode 100644 index 8485a2288..000000000 --- a/collections/ansible_collections/operator_sdk/util/plugins/modules/requeue_after.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -from __future__ import absolute_import, division, print_function -from ansible.module_utils.basic import AnsibleModule -import re - -__metaclass__ = type - -ANSIBLE_METADATA = {'metadata_version': '1.1', - 'status': ['preview'], - 'supported_by': 'community'} - -DOCUMENTATION = ''' -module: requeue_after -short_description: Tells the controller to re-trigger reconciliation after the specified time -version_added: "0.1" -author: "Venkat Ramaraju (@VenkatRamaraju)" -description: - - Tells the controller to pause reconciliation and resume reconciliation after a specified amounts of time. - If the requeue_reconciliation period is set to 't', reconciliation will occur in intervals of 't'. - -options: - time: - type: str - description: - - A string containing a time period that will be set on the returned JSON object and then used to requeue - reconciliation of an event. Time can be specified in any combination of hours, minutes, and seconds. -''' - -EXAMPLES = ''' -- name: "Running the requeue_after module" - requeue_after: - time: 24h - -- name: "Running the requeue_after module" - requeue_after: - time: 30m - -- name: "Running the requeue_after module" - requeue_after: - time: 5s -''' - -RETURN = ''' -result: - description: - - If a requeue period was specified under 'time' when calling the requeue_after period from the module, - this module will return a JSON object. - returned: success - contains: - _ansible_no_log: - description: This is a boolean. If it’s True then the playbook specified no_log (in a task’s parameters or as - a play parameter). - returned: success - type: boolean - changed: - description: A boolean indicating if the task had to make changes. 
- returned: success - type: boolean - invocation: - description: Information on how the module was invoked. - returned: success - type: map - period: - description: A time value read in from a playbook that specifies how long the reconciliation should be - requeued after. - returned: success - type: str -''' - - -def requeue_after(): - module = AnsibleModule(argument_spec={ - 'time': {'type': 'str', 'required': True}, - }) - - if not re.match("^[hms0-9]*$", module.params['time']): - module.fail_json(msg="invalid time input") - - result = dict( - period=module.params['time'], - ) - - module.exit_json(**result) - - -def main(): - requeue_after() - - -if __name__ == '__main__': - main() diff --git a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml index 286d2c74b..37a847303 100644 --- a/deploy/crds/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/crds/infra.watch_servicetelemetrys_crd.yaml @@ -289,18 +289,18 @@ spec: description: Whether to disable the Grafana signout menu type: boolean ingressEnabled: - description: Enable ingress access to Grafana + description: Whether to enable ingress access to Grafana type: boolean - adminPassword: - description: Grafana admin password - type: string - format: password - adminUser: - description: Grafana admin user - type: string baseImage: description: Path to the base container image used to instantiate a Grafana instance type: string + dashboards: + description: Dashboard configurations for Grafana + properties: + enabled: + description: Whether to enable built-in dashboards provided by Service Telemetry Framework + type: boolean + type: object type: object type: object cloudsRemoveOnMissing: diff --git a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml index 8b4cf7142..468347c9a 100644 --- a/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml +++ 
b/deploy/crds/infra.watch_v1beta1_servicetelemetry_cr.yaml @@ -79,10 +79,11 @@ spec: graphing: enabled: false grafana: - ingressEnabled: false - adminPassword: secret - adminUser: root + ingressEnabled: true disableSignoutMenu: false + baseImage: registry.redhat.io/rhel8/grafana:9 + dashboards: + enabled: true transports: qdr: enabled: true diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml index f26cbc7b9..23efd2236 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/infra.watch_servicetelemetrys_crd.yaml @@ -385,22 +385,23 @@ spec: grafana: description: Grafana related configuration properties: - adminPassword: - description: Grafana admin password - format: password - type: string - adminUser: - description: Grafana admin user - type: string baseImage: description: Path to the base container image used to instantiate a Grafana instance type: string + dashboards: + description: Dashboard configurations for Grafana + properties: + enabled: + description: Whether to enable built-in dashboards provided + by Service Telemetry Framework + type: boolean + type: object disableSignoutMenu: description: Whether to disable the Grafana signout menu type: boolean ingressEnabled: - description: Enable ingress access to Grafana + description: Whether to enable ingress access to Grafana type: boolean type: object type: object diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 65e5a244e..00cec8767 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ 
b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -117,10 +117,12 @@ metadata: "graphing": { "enabled": false, "grafana": { - "adminPassword": "secret", - "adminUser": "root", + "baseImage": "registry.redhat.io/rhel8/grafana:9", + "dashboards": { + "enabled": true + }, "disableSignoutMenu": false, - "ingressEnabled": false + "ingressEnabled": true } }, "highAvailability": { @@ -154,7 +156,7 @@ metadata: features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" - features.operators.openshift.io/disconnected: "false" + features.operators.openshift.io/disconnected: "true" features.operators.openshift.io/fips-compliant: "false" features.operators.openshift.io/proxy-aware: "false" features.operators.openshift.io/tls-profiles: "false" @@ -363,6 +365,7 @@ spec: - monitoring.coreos.com - monitoring.rhobs - elasticsearch.k8s.elastic.co + - grafana.integreatly.org - integreatly.org resources: - '*' diff --git a/deploy/role.yaml b/deploy/role.yaml index 6e22854e4..cdade2ce7 100644 --- a/deploy/role.yaml +++ b/deploy/role.yaml @@ -120,6 +120,7 @@ rules: - monitoring.coreos.com - monitoring.rhobs - elasticsearch.k8s.elastic.co + - grafana.integreatly.org - integreatly.org resources: - '*' diff --git a/roles/servicetelemetry/defaults/main.yml b/roles/servicetelemetry/defaults/main.yml index 98943a6de..9969683c3 100644 --- a/roles/servicetelemetry/defaults/main.yml +++ b/roles/servicetelemetry/defaults/main.yml @@ -84,11 +84,11 @@ servicetelemetry_defaults: graphing: enabled: false grafana: - ingress_enabled: false - admin_password: secret - admin_user: root + ingress_enabled: true disable_signout_menu: false - base_image: docker.io/grafana/grafana:8.1.2 + base_image: registry.redhat.io/rhel8/grafana:9 + dashboards: + enabled: true # 'clouds' object is not partially updatable like other objects. 
If 'clouds' # object is defined then the default is overwritten. diff --git a/roles/servicetelemetry/files/memcached-dashboard.json b/roles/servicetelemetry/files/memcached-dashboard.json new file mode 100644 index 000000000..e68a439b9 --- /dev/null +++ b/roles/servicetelemetry/files/memcached-dashboard.json @@ -0,0 +1,1513 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Tracking dashboard for memcached service", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 2, + "iteration": 1698247048278, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 16, + "panels": [], + "title": "Availability and connections", + "type": "row" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_connections{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Current connections", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + 
"value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_uptime{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_items{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Items", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_connections_total{service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Connection rate (1m)", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_total_events_total{service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Max connections reached", + "type": "timeseries" + }, + { + "collapsed": 
false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 14, + "panels": [], + "title": "System metrics", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:89", + "alias": "/Rx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_octets_tx_total{service=~\".+-$clouds-.+\"}[1m])", + "hide": false, + "interval": "", + "legendFormat": "Tx {{ host }}", + "refId": "B" + }, + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_octets_rx_total{service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "Rx {{ host }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Transfer rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:62", + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:63", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "collectd_libpodstats_pod_memory{plugin_instance=\"memcached\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Memory", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "collectd_libpodstats_pod_cpu_percent{plugin_instance=\"memcached\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "CPU percent", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 46, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_libpodstats_pod_cpu_time_total{plugin_instance=\"memcached\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "CPU time", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 
23 + }, + "id": 22, + "panels": [], + "title": "Cache performance", + "type": "row" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "(collectd_memcached_df_free{service=~\".+-$clouds-.+\"} + collectd_memcached_df_used{service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total cache available", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + 
"targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_df_used{service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Cache usage over time", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_df_used{service=~\".+-$clouds-.+\"} / (collectd_memcached_df_free{service=~\".+-$clouds-.+\"} + collectd_memcached_df_used{service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Cache utilization", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 33, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_command_total{type_instance=\"get\",service=~\".+-$clouds-.+\"}", + 
"interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total gets", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_command_total{type_instance=\"set\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total sets", + "type": "stat" + }, + { + "datasource": null, + "description": "This is a calculated metric: get_hits / cmd_get. 
It indicates how efficient your Memcached server is.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_ops_total{type_instance=\"hits\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Hit rate", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_ops_total{type_instance=\"misses\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Miss rate", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 32, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_command_total{type_instance=\"flush\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total flushes", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "purple", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 40, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": 
"auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "collectd_memcached_memcached_ops_total{type_instance=\"evictions\",service=~\".+-$clouds-.+\"}", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Total evictions", + "type": "stat" + }, + { + "datasource": null, + "description": "The flush_all command invalidates all items in the database. This operation incurs a performance penalty and shouldn’t take place in production, so check your debug scripts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_command_total{type_instance=\"flush\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Flush rate", + "type": "timeseries" + }, + { + "datasource": null, + "description": "An eviction is when an item that still has time to live is removed from the cache because a brand new item needs to be allocated.\nThe item is selected with a pseudo-LRU mechanism.\nA high number of evictions coupled 
with a low hit rate means your application is setting a large number of keys that are never used again.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "cps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(collectd_memcached_memcached_ops_total{type_instance=\"evictions\",service=~\".+-$clouds-.+\"}[1m])", + "interval": "", + "legendFormat": "{{ host }}", + "refId": "A" + } + ], + "title": "Eviction rate", + "type": "timeseries" + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "isNone": true, + "selected": true, + "text": "None", + "value": "" + }, + "datasource": null, + "definition": "label_values(collectd_memcached_percent, service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(collectd_memcached_percent, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + 
"skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Memcached View", + "uid": "VHbfxjinz", + "version": 3 +} diff --git a/roles/servicetelemetry/files/rhos-cloud-dashboard.json b/roles/servicetelemetry/files/rhos-cloud-dashboard.json new file mode 100644 index 000000000..47525b043 --- /dev/null +++ b/roles/servicetelemetry/files/rhos-cloud-dashboard.json @@ -0,0 +1,1633 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3, + "iteration": 1695784064538, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": true, + "tags": [ + "cloud-dashboards" + ], + "targetBlank": true, + "title": "Cloud Dashboards", + "tooltip": "", + "type": "dashboards", + "url": "" + } + ], + "panels": [ + { + "cacheTimeout": null, + "cards": { + "cardPadding": 0, + "cardRound": null + }, + "color": { + "cardColor": "#37872D", + "colorScale": "linear", + "colorScheme": "interpolateReds", + "exponent": 0.5, + "max": 1, + "min": 0, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 15, + "x": 0, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 21, + "interval": "10m", + "legend": { + "show": false + }, + "links": [], + "pluginVersion": "6.5.1", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": " avg(sensubility_container_health_status{process=\"glance_api\", service=~\".+-$clouds-.+\"})", 
+ "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "glance_api", + "refId": "D" + }, + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"nova_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "nova_api", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + }, + { + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"heat_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "heat_api", + "refId": "B" + }, + { + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"neutron_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "neutron_api", + "refId": "C" + }, + { + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"placement_api\", service=~\".+-$clouds-.+\"})", + "format": "heatmap", + "hide": false, + "interval": "", + "legendFormat": "placement_api", + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "2m", + "yAxis": { + "decimals": null, + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "middle", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + 
"overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 0 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($top, sum by (plugin_instance, host) (collectd_libpodstats_pod_memory{service=~\".+-$clouds-.+\"}))", + "legendFormat": "{{plugin_instance}} on {{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Top $top Memory Consumers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 6 + }, + "hiddenSeries": false, + "id": 43, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + 
"percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($top, avg_over_time(collectd_libpodstats_pod_cpu_percent{service=~\".+-$clouds-.+\"}[10m]))", + "legendFormat": "{{plugin_instance}} on {{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Top $top CPU Consumers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:169", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:170", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 29, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + 
"field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"placement_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime placement_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 8 + }, + "id": 30, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"neutron_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime neutron_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 8 + }, + "id": 31, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"heat_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime heat_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 8 + }, + "id": 26, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", 
+ "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"nova_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime nova_api", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 8 + }, + "id": 32, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "alias": "", + "bucketAggs": [ + { + "field": "startsAt", + "id": "2", + "settings": { + "interval": "auto" + }, + "type": "date_histogram" + } + ], + "exemplar": true, + "expr": "avg(sensubility_container_health_status{process=\"glance_api\", service=~\".+-$clouds-.+\"})", + "interval": "", + "legendFormat": "", + "metrics": [ + { + "id": "1", + 
"type": "count" + } + ], + "query": "", + "refId": "A", + "timeField": "startsAt" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime glance_api", + "type": "stat" + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 6, + "panels": [], + "title": "Service Resource Usage", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_libpodstats_pod_cpu_percent{plugin_instance=\"horizon\", service=~\".+-$clouds-.+\"}", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Horizon CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:235", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:236", + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": 
null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_libpodstats_pod_memory{plugin_instance=\"horizon\", service=~\".+-$clouds-.+\"}", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Horizon Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": 
false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_cpu_percent{plugin_instance=~\"nova.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nova CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:293", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:294", + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_memory{plugin_instance=~\"nova.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nova Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_cpu_percent{plugin_instance=~\"ceilometer.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ceilometer CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:465", + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": null, + "show": true + }, + { + "$$hashKey": "object:466", + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 28 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (host) (collectd_libpodstats_pod_memory{plugin_instance=~\"ceilometer.*\", service=~\".+-$clouds-.+\"})", + "legendFormat": "{{host}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ceilometer Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [ + "cloud-dashboards" + ], + "templating": { + "list": [ + { + "allValue": null, + "datasource": null, + "definition": "label_values(collectd_cpu_percent, service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(collectd_cpu_percent, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "STFPrometheus", + "definition": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"},project)", + "description": null, + "error": null, + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "projects", + "options": [], + "query": { + "query": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"},project)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "5", + "value": "5" + }, + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "top", + "options": [ + { + "selected": false, + "text": "1", + "value": "1" + }, + { + "selected": false, + "text": "2", + "value": "2" + }, + { + "selected": false, + "text": "3", + "value": "3" + }, + { + "selected": false, + "text": "4", + "value": "4" + }, + { + "selected": true, + "text": "5", + "value": "5" + }, + { 
+ "selected": false, + "text": "6", + "value": "6" + }, + { + "selected": false, + "text": "7", + "value": "7" + }, + { + "selected": false, + "text": "8", + "value": "8" + }, + { + "selected": false, + "text": "9", + "value": "9" + }, + { + "selected": false, + "text": "10", + "value": "10" + }, + { + "selected": false, + "text": "11", + "value": "11" + }, + { + "selected": false, + "text": "12", + "value": "12" + }, + { + "selected": false, + "text": "13", + "value": "13" + }, + { + "selected": false, + "text": "14", + "value": "14" + }, + { + "selected": false, + "text": "15", + "value": "15" + }, + { + "selected": false, + "text": "16", + "value": "16" + }, + { + "selected": false, + "text": "17", + "value": "17" + }, + { + "selected": false, + "text": "18", + "value": "18" + }, + { + "selected": false, + "text": "19", + "value": "19" + }, + { + "selected": false, + "text": "20", + "value": "20" + } + ], + "query": "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Cloud View", + "uid": "IHqhpjPZz", + "version": 15 +} diff --git a/roles/servicetelemetry/files/rhos-dashboard.json b/roles/servicetelemetry/files/rhos-dashboard.json new file mode 100644 index 000000000..871f02366 --- /dev/null +++ b/roles/servicetelemetry/files/rhos-dashboard.json @@ -0,0 +1,2179 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 2, + "iteration": 1695783546006, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": "STFPrometheus", 
+ "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 68, + "panels": [], + "title": "Quickview", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "text": "Node Active" + }, + "1": { + "text": "Node Inactive" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#37872D", + "value": null + }, + { + "color": "#C4162A", + "value": 1 + }, + { + "color": "#C4162A", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 0, + "y": 1 + }, + "id": 33, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "absent({host = '$hosts', service=~\".+-$clouds-.+\"}) or label_replace(vector(0), \"host\", \"$hosts\", \"host\", \".*\")", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "Time node has been operational", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "dtdurations" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 5, + "w": 2, + "x": 2, + "y": 1 + }, + "id": 31, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "collectd_uptime{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Uptime", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0.0%", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "#d44a3a", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 4, + "y": 1 + }, + "id": 19, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_cpu_percent{type_instance!=\"idle\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) / count(sum by (host,plugin_instance) (collectd_cpu_percent{host=\"$hosts\", service=~\".+-$clouds-.+\"}))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0.0%", + "type": 1, + "value": "null" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "#d44a3a", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 7, + "y": 1 + }, + "id": 44, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_memory{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"})/ sum(collectd_memory{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{memory}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 10, + "y": 1 + }, + "id": 41, + "links": [], + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + 
"expr": "sum(collectd_df_df_complex{host=\"$hosts\",type_instance=\"used\", service=~\".+-$clouds-.+\"}) by (plugin_instance) / sum(collectd_df_df_complex{host=\"$hosts\", service=~\".+-$clouds-.+\"}) by (plugin_instance)", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "File Systems", + "type": "bargauge" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 13, + "y": 1 + }, + "id": 54, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "delta" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_interface_if_errors_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}) + sum(collectd_interface_if_errors_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Interface Errors", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Load average represents the average number of running and un-interruptable processes residing in the 
kernel's execution queue. \n\nTypically, short term, midterm, and long term series give running averages of 1m, 5m, and 15m, respectively. ", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 1 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_load_shortterm{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "short term", + "refId": "A" + }, + { + "expr": "collectd_load_midterm{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "mid term", + "refId": "B" + }, + { + "expr": "collectd_load_longterm{host=\"$hosts\", service=~\".+-$clouds-.+\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "long term", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load Average", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "Processes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + 
"alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 37, + "panels": [], + "title": "Network Interfaces", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 48, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_interface_if_octets_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Rx {{plugin_instance}}", + "refId": "A" + }, + { + "expr": "rate(collectd_interface_if_octets_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Tx {{plugin_instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Data", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 7 + }, + "hiddenSeries": false, + "id": 56, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_interface_if_errors_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Rx {{plugin_instance}}", + "refId": "A" + }, + { + "expr": "rate(collectd_interface_if_errors_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Tx {{plugin_instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "errors/s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, 
+ "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 7 + }, + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_interface_if_dropped_rx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Rx {{plugin_instance}}", + "refId": "A" + }, + { + "expr": "rate(collectd_interface_if_dropped_tx_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m])", + "legendFormat": "Tx {{plugin_instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Drop Rates", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": 
"STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 21, + "panels": [], + "title": "CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Average non-idle CPU activity of all cores on node", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_cpu_percent{type_instance!=\"idle\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) / count(sum by (type_instance) (collectd_cpu_percent{type_instance!=\"idle\",host=\"$hosts\", service=~\".+-$clouds-.+\"}))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Aggr. 
Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Shows average time spent for each activity across all cores", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_cpu_percent{type_instance!=\"idle\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) by (type_instance) / count(collectd_cpu_percent{host=\"$hosts\", service=~\".+-$clouds-.+\"}) by (type_instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{type_instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Aggr. 
Usage by Type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 25, + "panels": [], + "title": "Memory", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": null, + "description": "Memory used on node", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_memory{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "total", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 9, + "x": 12, + "y": 25 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(collectd_hugepages_vmpage_number{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{hugepages}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Huge Pages", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": 
true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "0.0%", + "type": 1, + "value": "null" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 25 + }, + "id": 71, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum(collectd_hugepages_vmpage_number{type_instance=\"used\",host=\"$hosts\", service=~\".+-$clouds-.+\"}) / sum(collectd_hugepages_vmpage_number{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{hugepages}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Huge Pages (%)", + "type": "gauge" + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 11, + "panels": [], + "title": "File System", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "STFPrometheus", + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 31 + }, + "id": 51, + "links": [], + 
"options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "expr": "sum by (plugin_instance) (collectd_df_df_inodes{type_instance=\"used\", host=\"$hosts\", service=~\".+-$clouds-.+\"}) / sum by (plugin_instance) (collectd_df_df_inodes{host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Inode Usage", + "type": "bargauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 18, + "x": 6, + "y": 31 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (plugin_instance) (collectd_df_df_complex{type_instance!~\"free\",host=\"$hosts\", service=~\".+-$clouds-.+\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{plugin_instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File System Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "STFPrometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 70, + "panels": [], + "title": "Disk", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": 2, + "description": "10m rolling average", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_octets_read_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "B" + }, + { + "expr": "sum(rate(collectd_disk_disk_octets_write_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + 
"timeShift": null, + "title": "Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": 2, + "description": "Approximate percentage of total disk bandwidth being used.\n\nWeighted I/O includes the backlog that may be accumulating.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_io_time_io_time_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[1h]))/1000", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "i/o", + "refId": "A" + }, + { + "expr": "sum(rate(collectd_disk_disk_io_time_weighted_io_time_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[1h]))/1000", + "legendFormat": "weighted i/o", + "refId": "B" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "decimals": null, + "description": "", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_ops_read_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum(rate(collectd_disk_disk_ops_write_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + 
"timeShift": null, + "title": "Operations/s", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "STFPrometheus", + "description": "Average time each I/O operation took to complete. Per the collectd disk plugin docs (https://collectd.org/wiki/index.php/Plugin:Disk), this average is not very accurate.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 46 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(collectd_disk_disk_time_read_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum(rate(collectd_disk_disk_time_write_total{host=\"$hosts\", service=~\".+-$clouds-.+\"}[10m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": 
"write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg. I/O Operation Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "datasource": null, + "definition": "label_values(collectd_cpu_percent, service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(collectd_cpu_percent, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "datasource": "STFPrometheus", + "definition": "label_values(collectd_cpu_percent{service=~\".+-$clouds-.+\"}, host)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "node", + "multi": false, + "name": "hosts", + "options": [], + "query": { + "query": "label_values(collectd_cpu_percent{service=~\".+-$clouds-.+\"}, host)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + 
"refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Infrastructure Node View", + "uid": "1F1OJZEWz", + "version": 4 +} diff --git a/roles/servicetelemetry/files/virtual-machine-view.json b/roles/servicetelemetry/files/virtual-machine-view.json new file mode 100644 index 000000000..0d5b4a191 --- /dev/null +++ b/roles/servicetelemetry/files/virtual-machine-view.json @@ -0,0 +1,1112 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 4, + "iteration": 1695785660982, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 8, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "count((ceilometer_cpu{project=\"$project\"}))", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": 
null, + "title": "Virtual Machine Instances", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(collectd_virt_percent, \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (avg by (resource,project) (ceilometer_cpu{project=\"$project\"}) * 0)", + "format": "time_series", + "interval": "", + "legendFormat": "{{ plugin_instance }} on {{ node }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "VM CPU %", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ns" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + "exemplar": true, + "expr": "rate(ceilometer_cpu{project=\"$project\"}[1m])", + "interval": "", + "legendFormat": "{{ resource }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Time for Instances", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "displayMode": "auto", + "filterable": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "displayName", + "value": "Time" + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "plugin_instance" + }, + "properties": [ + { + "id": "displayName", + "value": "Virtual Machines" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": null + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/Value/" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.align", + "value": "auto" + } + ] + } + ] + }, + 
"gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 3 + }, + "id": 16, + "options": { + "showHeader": true + }, + "pluginVersion": "7.5.15", + "targets": [ + { + "exemplar": true, + "expr": "(ceilometer_cpu{project=\"$project\"}) + on (resource) group_right(project) label_replace(label_replace(collectd_virt_virt_cpu_total_total, \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"Node\", \"$1\", \"host\", \".+:.+:(.+)\")", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "{{ plugin_instance }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "transformations": [ + { + "id": "merge", + "options": { + "reducers": [] + } + }, + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "plugin_instance", + "Node" + ] + } + } + } + ], + "transparent": true, + "type": "table" + }, + { + "datasource": null, + "description": "Memory utilization of that allocated to the virtual machine.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "opacity", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "links": [], + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 6 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "8.0.4", + "targets": [ + { + 
"exemplar": true, + "expr": "(label_replace(label_replace(collectd_virt_memory, \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") / 1000000) / on (resource) group_left (project) ceilometer_memory_usage{project=\"$project\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ plugin_instance }} [{{ type_instance }}] on {{ node }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "VM Memory Utilization (Allocated)", + "type": "timeseries" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine disk operations rate (in operations/second)", + "fieldConfig": { + "defaults": { + "links": [], + "unit": "decbytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 6 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_ops_read_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Read {{ plugin_instance }} disk {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_ops_write_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", 
\".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Write {{ plugin_instance }} disk {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Disk Operations Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine network dropped packet rate (in packets-per-second)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 12 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:140", + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_dropped_rx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) 
group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Rx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_dropped_tx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Tx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Network Dropped Packet Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine network error rate (in packets-per-second)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 12 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": 
"object:201", + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_errors_rx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Rx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_errors_tx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Tx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Network Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Virtual machine disk throughput rate (in bytes)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 17 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + 
"min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:322", + "alias": "/Write/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_octets_read_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Read {{ plugin_instance }} disk {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_disk_octets_write_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Write {{ plugin_instance }} disk {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Disk Throughput Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
null, + "description": "Virtual machine network throughput rate (in bytes)", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 17 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.15", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:383", + "alias": "/Tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_octets_rx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Rx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "label_replace(label_replace(rate(collectd_virt_if_octets_tx_total[1m]), \"resource\", \"$1\", \"host\", \".+:(.+):.+\"), \"node\", \"$1\", \"host\", \".+:.+:(.+)\") + on (resource) group_left(project) (ceilometer_cpu{project=\"$project\"} * 0)", + "interval": "", + "legendFormat": "Tx {{ plugin_instance }} interface {{ type_instance }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "VM Network Throughput Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "datasource": null, + "definition": "label_values(service)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "cloud", + "multi": false, + "name": "clouds", + "options": [], + "query": { + "query": "label_values(service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "/.+-(.+)-coll-meter/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "datasource": null, + "definition": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"}, project)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "project", + "multi": false, + "name": "project", + "options": [], + "query": { + "query": "label_values(ceilometer_cpu{service=~\".+-$clouds-.+\"}, project)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Virtual Machine View", + "uid": "JJzvn8mnz", + "version": 4 +} diff --git a/roles/servicetelemetry/tasks/component_grafana.yml b/roles/servicetelemetry/tasks/component_grafana.yml index 068507610..7bdd939ee 100644 --- a/roles/servicetelemetry/tasks/component_grafana.yml +++ 
b/roles/servicetelemetry/tasks/component_grafana.yml @@ -1,78 +1,9 @@ -- name: Construct oauth redirect reference - set_fact: - grafana_oauth_redir_ref: - kind: OAuthRedirectReference - apiVersion: v1 - reference: - kind: Route - name: 'grafana-route' - -- name: Check for existing grafana htpasswd secret - no_log: true - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-grafana-htpasswd' - register: grafana_htpasswd_secret - -- block: - - name: Parse current Grafana htpasswd salt from secret - no_log: true - set_fact: - grafana_htpasswd_salt: "{{ ((grafana_htpasswd_secret.resources[0].data.auth | b64decode).split('$')[-1])[0:22] }}" - rescue: - - name: Generate initial Grafana htpasswd bcrypt string from grafana.admin_password - no_log: true - set_fact: - init_grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt') | replace('$2b$','$2y$', 1)) }}" - - - name: Read newly generated Grafana htpasswd salt - no_log: true - set_fact: - grafana_htpasswd_salt: "{{ (init_grafana_htpasswd_bcrypt_string.split('$')[-1])[0:22] }}" - always: - - name: Generate Grafana htpasswd bcrypt string from grafana.adminPassword using salt - no_log: true - set_fact: - grafana_htpasswd_bcrypt_string: "{{ (servicetelemetry_vars.graphing.grafana.admin_password | password_hash('bcrypt', grafana_htpasswd_salt) | replace('$2b$','$2y$', 1)) }}" - - - name: Generate Grafana auth string from grafana.adminUser and grafana_htpasswd_bcrypt_string - no_log: true - set_fact: - grafana_htpasswd_auth_string: "{{ servicetelemetry_vars.graphing.grafana.admin_user }}:{{ grafana_htpasswd_bcrypt_string }}" - -- name: Create or patch htpasswd secret for grafana admin - no_log: false - k8s: - definition: - api_version: v1 - kind: Secret - metadata: - name: '{{ ansible_operator_meta.name }}-grafana-htpasswd' - namespace: '{{ ansible_operator_meta.namespace }}' - type: 
Opaque - stringData: - auth: '{{ grafana_htpasswd_auth_string }}' - -- name: Lookup template - debug: - msg: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" - -- name: Set default Grafana manifest - set_fact: - grafana_manifest: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" - when: grafana_manifest is not defined - -- name: Create an instance of Grafana - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' - definition: - '{{ grafana_manifest }}' - +# dashboard setup first looks for Grafana Operator v5 CRDs. If existing, prefer setup with v5. +# If v5 doesn't exist, then try v4. Don't create objects for v4 if v5 CRDs exist. - when: servicetelemetry_vars.graphing.enabled block: - when: servicetelemetry_vars.backends.metrics.prometheus.enabled + name: Get auth data for datasources to Prometheus block: - name: Retrieve configmap for OAUTH CA certs k8s_info: @@ -82,36 +13,260 @@ namespace: '{{ ansible_operator_meta.namespace }}' register: serving_certs_ca - - name: Retrieve prometheus secret + - name: Retrieve prometheus reader token k8s_info: api_version: v1 kind: Secret namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_secret + name: stf-prometheus-reader-token + register: prometheus_reader_secret - - name: Decode prometheus password + - name: Decode prometheus reader token no_log: true set_fact: - prom_basicauth_passwd: '{{ prometheus_secret.resources[0].data.password | b64decode }}' - - # Lookup existing datasources - - name: Remove legacy datasources - k8s: - api_version: integreatly.org/v1alpha1 - name: '{{ ansible_operator_meta.name }}-ds-prometheus' - kind: GrafanaDataSource - namespace: '{{ ansible_operator_meta.namespace }}' - state: absent - - # NOTE: this can fail if you enable grafana without prometheus due to missing resources referenced in the template - - name: Set datasources - set_fact: - 
ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" - when: ds_manifest is not defined - - - name: Create the datasources - k8s: - state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' - definition: - '{{ ds_manifest }}' + prometheus_reader_token: '{{ prometheus_reader_secret.resources[0].data.token | b64decode }}' + +#---- deploy Grafana with v5 Operator (preferred) + - when: has_grafana_integreatly_api + name: Deploying with Grafana Operator v5 + block: + - name: Construct oauth redirect reference + set_fact: + grafana_oauth_redir_ref: + kind: OAuthRedirectReference + apiVersion: v1 + reference: + kind: Route + name: '{{ ansible_operator_meta.name }}-grafana-route' + + - name: Lookup template + debug: + msg: "{{ lookup('template', './manifest_grafana_v5.j2') | from_yaml }}" + + - name: Set default Grafana manifest (Grafana Operator v5) + set_fact: + grafana_manifest: "{{ lookup('template', './manifest_grafana_v5.j2') | from_yaml }}" + when: grafana_manifest is not defined + + - name: Create an instance of Grafana (Grafana Operator v5) + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ grafana_manifest }}' + + # NOTE: we only provide events forwarding with STF. We don't use events + # in dashboards, so there is no need to create an Elasticsearch + # datasource. 
+ - when: servicetelemetry_vars.backends.metrics.prometheus.enabled + name: Create Grafana datasource for Prometheus + block: + - name: Set datasource for Prometheus + set_fact: + ds_manifest: "{{ lookup('template', './manifest_grafana_ds_prometheus.j2') | from_yaml }}" + when: ds_manifest is not defined + + - name: Create the datasource for Prometheus + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ ds_manifest }}' + + - name: Load Cloud Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-cloud-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: rhos-cloud-dashboard.json + json: | + {{ lookup('file', 'rhos-cloud-dashboard.json') | string }} + + - name: Load Infrastructure Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: rhos-dashboard.json + json: | + {{ lookup('file', 'rhos-dashboard.json') | string }} + + - name: Load Memcached Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: memcached-dashboard-1 + namespace: "{{ 
ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: memcached-dashboard.json + json: | + {{ lookup('file', 'memcached-dashboard.json') | string }} + + - name: Load Virtual Machine View Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: virtual-machine-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + instanceSelector: + matchLabels: + dashboards: "stf" + name: virtual-machine-view.json + json: | + {{ lookup('file', 'virtual-machine-view.json') | string }} + +#---- deploy Grafana with v4 Operator if v5 CRDs are not available (legacy deployments) + - when: has_integreatly_api and not has_grafana_integreatly_api + name: Deploying with Grafana Operator v4 + block: + - name: Construct oauth redirect reference + set_fact: + grafana_oauth_redir_ref: + kind: OAuthRedirectReference + apiVersion: v1 + reference: + kind: Route + name: 'grafana-route' + + - name: Lookup template + debug: + msg: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" + + - name: Set default Grafana manifest (Grafana Operator v4) + set_fact: + grafana_manifest: "{{ lookup('template', './manifest_grafana.j2') | from_yaml }}" + when: grafana_manifest is not defined + + - name: Create an instance of Grafana (Grafana Operator v4) + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ grafana_manifest }}' + + - name: Remove legacy datasources + k8s: + api_version: integreatly.org/v1alpha1 + name: '{{ ansible_operator_meta.name }}-ds-prometheus' + kind: GrafanaDataSource + namespace: '{{ ansible_operator_meta.namespace }}' + state: absent + + # NOTE: This can fail if you enable grafana without prometheus due + # to 
missing resources referenced in the template. The v1alpha1 CRD + # of GrafanaDatasources uses a list, so logic would need to be + # added to the template directly checking for parameters set in + # ServiceTelemetry. + - name: Set datasources + set_fact: + ds_manifest: "{{ lookup('template', './manifest_grafana_ds.j2') | from_yaml }}" + when: ds_manifest is not defined + + - name: Create the datasources + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.enabled else "absent" }}' + definition: + '{{ ds_manifest }}' + + - name: Load Cloud Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-cloud-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: rhos-cloud-dashboard.json + json: | + {{ lookup('file', 'rhos-cloud-dashboard.json') | string }} + + - name: Load Infrastructure Overview Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: rhos-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: rhos-dashboard.json + json: | + {{ lookup('file', 'rhos-dashboard.json') | string }} + + - name: Load Memcached Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: memcached-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: memcached-dashboard.json + json: | + {{ 
lookup('file', 'memcached-dashboard.json') | string }} + + - name: Load Virtual Machine View Dashboard + k8s: + state: '{{ "present" if servicetelemetry_vars.graphing.grafana.dashboards.enabled else "absent" }}' + definition: + apiVersion: integreatly.org/v1alpha1 + kind: GrafanaDashboard + metadata: + labels: + app: grafana + stf_owner: "{{ ansible_operator_meta.name }}" + name: virtual-machine-dashboard-1 + namespace: "{{ ansible_operator_meta.namespace }}" + spec: + name: virtual-machine-view.json + json: | + {{ lookup('file', 'virtual-machine-view.json') | string }} diff --git a/roles/servicetelemetry/tasks/component_prometheus.yml b/roles/servicetelemetry/tasks/component_prometheus.yml index 2e865abd2..601a28247 100644 --- a/roles/servicetelemetry/tasks/component_prometheus.yml +++ b/roles/servicetelemetry/tasks/component_prometheus.yml @@ -42,12 +42,6 @@ - subjectaccessreviews verbs: - create - - apiGroups: - - "" - resources: - - namespaces - verbs: - - get - name: Setup ClusterRoleBinding for Prometheus block: @@ -123,6 +117,18 @@ - securitycontextconstraints verbs: - use + - apiGroups: + - '{{ prometheus_operator_api_string | replace("/v1","") }}' + resources: + - alertmanagers + verbs: + - get + - apiGroups: + - smartgateway.infra.watch + resources: + - smartgateways + verbs: + - get - name: Setup RoleBinding for Prometheus block: @@ -173,46 +179,6 @@ name: prometheus-k8s-{{ ansible_operator_meta.namespace }} namespace: '{{ ansible_operator_meta.namespace }}' -- name: Check for existing prometheus htpasswd user secret - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_htpasswd - -- name: Create a new prometheus password if it doesn't exist yet - when: prometheus_htpasswd.resources|length == 0 - block: - - name: Set prometheus htpasswd - no_log: true - set_fact: - prom_basicauth_passwd: "{{ lookup('password', '/dev/null') }}" 
- - - name: Create htpasswd secret # Contains both the htpasswd version and plaintext for lookup - no_log: true - k8s: - definition: - api_version: v1 - kind: Secret - metadata: - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - namespace: '{{ ansible_operator_meta.namespace }}' - type: Opaque - stringData: - auth: 'internal:{{ prom_basicauth_passwd | password_hash("bcrypt") | replace("$2b$","$2y$", 1)}}' - password: '{{ prom_basicauth_passwd }}' - tags: - - skip_ansible_lint - - - name: Re-register new object for use in the annotation - k8s_info: - api_version: v1 - kind: Secret - namespace: '{{ ansible_operator_meta.namespace }}' - name: '{{ ansible_operator_meta.name }}-prometheus-htpasswd' - register: prometheus_htpasswd - - name: Lookup template debug: msg: "{{ lookup('template', './manifest_prometheus.j2') | from_yaml }}" diff --git a/roles/servicetelemetry/tasks/component_prometheus_reader.yml b/roles/servicetelemetry/tasks/component_prometheus_reader.yml new file mode 100644 index 000000000..6cbee8b42 --- /dev/null +++ b/roles/servicetelemetry/tasks/component_prometheus_reader.yml @@ -0,0 +1,58 @@ +- name: Create ServiceAccount/stf-prometheus-reader + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: v1 + kind: ServiceAccount + metadata: + name: stf-prometheus-reader + namespace: '{{ ansible_operator_meta.namespace }}' + +- name: Create prometheus-reader Role + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-reader + namespace: '{{ ansible_operator_meta.namespace }}' + rules: + - apiGroups: + - '{{ prometheus_operator_api_string | replace("/v1","") }}' + resources: + - prometheus + verbs: + - get + namespaces: + - '{{ ansible_operator_meta.namespace }}' + +- name: Create prometheus-reader RoleBinding 
for stf-prometheus-reader + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: stf-prometheus-reader + namespace: '{{ ansible_operator_meta.namespace }}' + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-reader + subjects: + - kind: ServiceAccount + name: stf-prometheus-reader + +- name: Create an access token for stf-prometheus-reader + k8s: + state: '{{ "present" if servicetelemetry_vars.backends.metrics.prometheus.enabled else "absent" }}' + definition: + apiVersion: v1 + kind: Secret + metadata: + name: stf-prometheus-reader-token + namespace: '{{ ansible_operator_meta.namespace }}' + annotations: + kubernetes.io/service-account.name: stf-prometheus-reader + type: kubernetes.io/service-account-token diff --git a/roles/servicetelemetry/tasks/main.yml b/roles/servicetelemetry/tasks/main.yml index bc33df647..3bef4a10f 100644 --- a/roles/servicetelemetry/tasks/main.yml +++ b/roles/servicetelemetry/tasks/main.yml @@ -47,6 +47,8 @@ - block: - name: Create Prometheus instance include_tasks: component_prometheus.yml + - name: Create Prometheus read-only user + include_tasks: component_prometheus_reader.yml # --> alerting - name: Create Alertmanager instance @@ -85,12 +87,16 @@ loop_var: this_cloud # --> graphing -- name: Check if we have integreatly.org API +- name: Check if we have integreatly.org API (Grafana Operator v4) set_fact: has_integreatly_api: "{{ True if 'integreatly.org' in api_groups else False }}" +- name: Check if we have grafana.integreatly.org API (Grafana Operator v5) + set_fact: + has_grafana_integreatly_api: "{{ True if 'grafana.integreatly.org' in api_groups else False }}" + - when: - - has_integreatly_api | bool + - (has_integreatly_api | bool) or (has_grafana_integreatly_api | bool) name: Start graphing component plays include_tasks: component_grafana.yml diff 
--git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 2465ee43f..5b53cc592 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -26,8 +26,8 @@ spec: - -upstream=http://localhost:9093/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=alertmanager-stf - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' - - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get"}}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/": {"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9095 name: https diff --git a/roles/servicetelemetry/templates/manifest_grafana.j2 b/roles/servicetelemetry/templates/manifest_grafana.j2 index 792f7065c..8b176b103 100644 --- a/roles/servicetelemetry/templates/manifest_grafana.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana.j2 @@ -7,9 +7,6 @@ spec: serviceAccount: annotations: serviceaccounts.openshift.io/oauth-redirectreference.primary: '{{ grafana_oauth_redir_ref | to_json }}' - deployment: - annotations: - hash-of-creds-to-force-restart-if-changed: {{ grafana_htpasswd_auth_string | b64encode }} baseImage: {{ servicetelemetry_vars.graphing.grafana.base_image }} ingress: enabled: {{ servicetelemetry_vars.graphing.grafana.ingress_enabled }} @@ -40,13 +37,12 @@ spec: - -provider=openshift - -pass-basic-auth=false - -https-address=:3002 - - -htpasswd-file=/etc/proxy/htpasswd/auth - -tls-cert=/etc/tls/private/tls.crt - -tls-key=/etc/tls/private/tls.key - -upstream=http://localhost:3000 - 
-cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=grafana-serviceaccount - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"integreatly.org", "verb":"get"}' - -openshift-ca=/etc/pki/tls/cert.pem - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt ports: @@ -58,12 +54,9 @@ spec: name: secret-{{ ansible_operator_meta.name }}-grafana-proxy-tls - mountPath: /etc/proxy/secrets name: secret-{{ ansible_operator_meta.name }}-session-secret - - mountPath: /etc/proxy/htpasswd - name: secret-{{ ansible_operator_meta.name }}-grafana-htpasswd secrets: - '{{ ansible_operator_meta.name }}-grafana-proxy-tls' - '{{ ansible_operator_meta.name }}-session-secret' - - '{{ ansible_operator_meta.name }}-grafana-htpasswd' service: ports: - name: web diff --git a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 index d0f0478d1..a453b311a 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_ds.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_ds.j2 @@ -12,14 +12,13 @@ spec: jsonData: timeInterval: 5s tlsAuthWithCACert: true + httpHeaderName1: 'Authorization' name: STFPrometheus type: prometheus url: 'https://{{ ansible_operator_meta.name }}-prometheus-proxy.{{ ansible_operator_meta.namespace }}.svc:9092' version: 1 - basicAuth: true - basicAuthUser: internal secureJsonData: - basicAuthPassword: '{{ prom_basicauth_passwd }}' + httpHeaderValue1: 'Bearer {{prometheus_reader_token}}' tlsCACert: | {{ serving_certs_ca.resources[0].data['service-ca.crt'] | indent(10) }} {% endif %} diff --git a/roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 b/roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 new file mode 100644 index 000000000..473389cf8 --- /dev/null +++ 
b/roles/servicetelemetry/templates/manifest_grafana_ds_prometheus.j2 @@ -0,0 +1,24 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: {{ ansible_operator_meta.name }}-ds-stf-prometheus + namespace: {{ ansible_operator_meta.namespace }} +spec: + instanceSelector: + matchLabels: + dashboards: "stf" + datasource: + name: STFPrometheus + type: prometheus + access: proxy + url: 'https://{{ ansible_operator_meta.name }}-prometheus-proxy.{{ ansible_operator_meta.namespace }}.svc:9092' + isDefault: true + editable: true + jsonData: + 'timeInterval': "5s" + 'tlsAuthWithCACert': true + 'httpHeaderName1': 'Authorization' + secureJsonData: + 'httpHeaderValue1': 'Bearer {{prometheus_reader_token}}' + 'tlsCACert': | + {{ serving_certs_ca.resources[0].data['service-ca.crt'] | indent(8) }} diff --git a/roles/servicetelemetry/templates/manifest_grafana_v5.j2 b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 new file mode 100644 index 000000000..278e452ff --- /dev/null +++ b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 @@ -0,0 +1,97 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: {{ ansible_operator_meta.name }}-grafana + namespace: {{ ansible_operator_meta.namespace }} + labels: + dashboards: "stf" +spec: + serviceAccount: + metadata: + annotations: + serviceaccounts.openshift.io/oauth-redirectreference.primary: '{{ grafana_oauth_redir_ref | to_json }}' +{% if servicetelemetry_vars.graphing.grafana.ingress_enabled is defined and servicetelemetry_vars.graphing.grafana.ingress_enabled %} + route: + spec: + port: + targetPort: web + tls: + termination: reencrypt + to: + kind: Service + name: {{ ansible_operator_meta.name }}-grafana-service + weight: 100 + wildcardPolicy: None +{% endif %} + client: + preferIngress: false + config: + auth: + disable_signout_menu: "{{ servicetelemetry_vars.graphing.grafana.disable_signout_menu }}" + disable_login_form: "True" + auth.anonymous: + enabled: "True" 
+ auth.proxy: + enabled: "True" + enable_login_token: "True" + header_property: "username" + header_name: "X-Forwarded-User" + log: + level: warn + mode: "console" + users: + auto_assign_org_role: Admin + deployment: + spec: + template: + spec: + volumes: + - name: 'secret-{{ ansible_operator_meta.name }}-grafana-proxy-tls' + secret: + secretName: '{{ ansible_operator_meta.name }}-grafana-proxy-tls' + - name: 'secret-{{ ansible_operator_meta.name }}-session-secret' + secret: + secretName: '{{ ansible_operator_meta.name }}-session-secret' + containers: + - name: oauth-proxy + image: {{ oauth_proxy_image }} + args: + - '-provider=openshift' + - '-pass-basic-auth=false' + - '-https-address=:3002' + - '-http-address=' + - '-email-domain=*' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"grafana.integreatly.org", "verb":"get"}' + - '-upstream=http://localhost:3000' + - '-tls-cert=/etc/tls/private/tls.crt' + - '-tls-key=/etc/tls/private/tls.key' + - '-client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token' + - '-cookie-secret-file=/etc/proxy/secrets/session_secret' + - '-openshift-service-account={{ ansible_operator_meta.name }}-grafana-sa' + - '-openshift-ca=/etc/pki/tls/cert.pem' + - '-openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + - '-skip-auth-regex=^/metrics' + ports: + - containerPort: 3002 + name: https + protocol: TCP + resources: { } + volumeMounts: + - mountPath: /etc/tls/private + name: secret-{{ ansible_operator_meta.name }}-grafana-proxy-tls + - mountPath: /etc/proxy/secrets + name: secret-{{ ansible_operator_meta.name }}-session-secret +{% if servicetelemetry_vars.graphing.grafana.base_image is defined %} + - name: grafana + image: {{ servicetelemetry_vars.graphing.grafana.base_image }} +{% endif %} + service: + metadata: + annotations: + service.alpha.openshift.io/serving-cert-secret-name: {{ ansible_operator_meta.name }}-grafana-proxy-tls + spec: + ports: + - 
name: web + port: 3002 + protocol: TCP + targetPort: https diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 2bdf408b9..66f2d5a8d 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -7,7 +7,11 @@ metadata: name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: +{% if observability_strategy != "use_community" %} + version: null +{% else %} version: v2.43.0 +{% endif %} replicas: {{ servicetelemetry_vars.backends.metrics.prometheus.deployment_size }} ruleSelector: {} securityContext: {} @@ -20,7 +24,6 @@ spec: labels: prometheus: '{{ ansible_operator_meta.name }}' annotations: - hash-of-entire-htpasswd-secret-to-force-restart-if-changed: {{ prometheus_htpasswd | sha1 }} {% if servicetelemetry_vars.alerting.enabled %} alerting: alertmanagers: @@ -42,10 +45,11 @@ spec: - -tls-cert=/etc/tls/private/tls.crt - -tls-key=/etc/tls/private/tls.key - -upstream=http://localhost:9090/ - - -htpasswd-file=/etc/proxy/htpasswd/auth - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=prometheus-stf - - '-openshift-sar={"resource": "namespaces", "verb": "get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/":{"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' + ports: - containerPort: 9092 name: https @@ -55,14 +59,11 @@ spec: name: secret-{{ ansible_operator_meta.name }}-prometheus-proxy-tls - mountPath: /etc/proxy/secrets name: secret-{{ ansible_operator_meta.name }}-session-secret - - mountPath: /etc/proxy/htpasswd - name: secret-{{ ansible_operator_meta.name 
}}-prometheus-htpasswd configMaps: - serving-certs-ca-bundle secrets: - '{{ ansible_operator_meta.name }}-prometheus-proxy-tls' - '{{ ansible_operator_meta.name }}-session-secret' - - '{{ ansible_operator_meta.name }}-prometheus-htpasswd' {% if servicetelemetry_vars.backends.metrics.prometheus.storage.strategy == "persistent" %} storage: volumeClaimTemplate: diff --git a/tests/infrared/17.1/README.md b/tests/infrared/17.1/README.md index 0db5c0734..b1ab53ac3 100644 --- a/tests/infrared/17.1/README.md +++ b/tests/infrared/17.1/README.md @@ -2,16 +2,64 @@ ## Basic deployment -```bash -OCP_ROUTE_IP="10.0.100.50" \ -CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ -AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ -AMQP_PASS="$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d)" \ -ENABLE_STF_CONNECTORS=true \ -ENABLE_GNOCCHI_CONNECTORS=false \ -CONTROLLER_MEMORY="24000" \ -COMPUTE_CPU="6" \ -COMPUTE_MEMORY="24000" \ -LIBVIRT_DISKPOOL="/home/libvirt/images" \ -./infrared-openstack.sh -``` +A basic deployment can be deployed and connected to an existing STF deployment automatically after logging into the OpenShift cluster hosting STF from the host system. + +### Prequisites + +* Logged into the host system where you'll deploy the virtualized OpenStack infrastructure. +* Installed infrared and cloned the infrawatch/service-telemetry-operator repository. +* DNS resolution (or `/etc/hosts` entry) of the OpenShift cluster API endpoint. +* Downloaded the `oc` binary, made it executable, and placed in $PATH. +* Logged into the OpenShift hosting STF and changed to the `service-telemetry` project from the host system. 
+ +### Procedure + +* Deploy the overcloud using the infrawatch-openstack.sh script: + ```bash + OCP_ROUTE_IP="10.0.111.41" \ + CA_CERT_FILE_CONTENT="$(oc get secret/default-interconnect-selfsigned -o jsonpath='{.data.ca\.crt}' | base64 -d)" \ + AMQP_HOST="$(oc get route default-interconnect-5671 -ojsonpath='{.spec.host}')" \ + AMQP_PASS="$(oc get secret default-interconnect-users -o json | jq -r .data.guest | base64 -d)" \ + ENABLE_STF_CONNECTORS=true \ + ENABLE_GNOCCHI_CONNECTORS=false \ + CONTROLLER_MEMORY="24000" \ + COMPUTE_CPU="6" \ + COMPUTE_MEMORY="24000" \ + LIBVIRT_DISKPOOL="/home/libvirt/images" \ + ./infrared-openstack.sh + ``` + +## Running a test workload + +You can run a test workload on the deployed overcloud by logging into the undercloud and completing some additional setup to allow for virtual machine workloads to run. + +### Procedure + +* Login to the undercloud from the host system: + ```bash + ir ssh undercloud-0 + ``` +* Complete the deployment of a private network, router, and other aspects to allow the virtual machine to be deployed: + ```bash + source overcloudrc + export PRIVATE_NETWORK_CIDR=192.168.100.0/24 + openstack flavor create --ram 512 --disk 1 --vcpu 1 --public tiny + curl -L -O https://download.cirros-cloud.net/0.5.0/cirros-0.5.0-x86_64-disk.img + openstack image create cirros --container-format bare --disk-format qcow2 --public --file cirros-0.5.0-x86_64-disk.img + openstack keypair create --public-key ~/.ssh/id_rsa.pub default + openstack security group create basic + openstack security group rule create basic --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 + openstack security group rule create --protocol icmp basic + openstack security group rule create --protocol udp --dst-port 53:53 basic + openstack network create --internal private + openstack subnet create private-net \ + --subnet-range $PRIVATE_NETWORK_CIDR \ + --network private + openstack router create vrouter + openstack router set vrouter --external-gateway 
public + openstack router add subnet vrouter private-net + openstack server create --flavor tiny --image cirros --key-name default --security-group basic --network private myserver + until [ "$(openstack server list --name myserver --column Status --format value)" = "ACTIVE" ]; do echo "Waiting for server to be ACTIVE..."; sleep 10; done + openstack server add floating ip myserver $(openstack floating ip create public --format json | jq .floating_ip_address | tr -d '"') + openstack server list + ``` diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index caaeb4e88..177157fe3 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -47,8 +47,8 @@ echo "*** [INFO] Working in project ${OCP_PROJECT}" echo "*** [INFO] Getting ElasticSearch authentication password" ELASTICSEARCH_AUTH_PASS=$(oc get secret elasticsearch-es-elastic-user -ogo-template='{{ .data.elastic | base64decode }}') -echo "*** [INFO] Getting Prometheus authentication password" -PROMETHEUS_AUTH_PASS=$(oc get secret default-prometheus-htpasswd -ogo-template='{{ .data.password | base64decode }}') +echo "*** [INFO] Getting Prometheus authentication token" +PROMETHEUS_AUTH_TOKEN=$(oc create token stf-prometheus-reader) echo "*** [INFO] Creating configmaps..." oc delete configmap/stf-smoketest-healthcheck-log configmap/stf-smoketest-collectd-config configmap/stf-smoketest-sensubility-config configmap/stf-smoketest-collectd-entrypoint-script configmap/stf-smoketest-ceilometer-publisher configmap/stf-smoketest-ceilometer-entrypoint-script job/stf-smoketest || true @@ -76,7 +76,7 @@ oc wait --for=jsonpath='{.status.phase}'=Running pod/qdr-test echo "*** [INFO] Creating smoketest jobs..." 
oc delete job -l app=stf-smoketest for NAME in "${CLOUDNAMES[@]}"; do - oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_PASS}/" ${REL}/smoketest_job.yaml.template) + oc create -f <(sed -e "s/<>/${NAME}/;s/<>/${ELASTICSEARCH_AUTH_PASS}/;s/<>/${PROMETHEUS_AUTH_TOKEN}/" ${REL}/smoketest_job.yaml.template) done echo "*** [INFO] Triggering an alertmanager notification..." diff --git a/tests/smoketest/smoketest_ceilometer_entrypoint.sh b/tests/smoketest/smoketest_ceilometer_entrypoint.sh index adf3a9046..0fc6f232f 100644 --- a/tests/smoketest/smoketest_ceilometer_entrypoint.sh +++ b/tests/smoketest/smoketest_ceilometer_entrypoint.sh @@ -5,7 +5,7 @@ set +e PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} ELASTICSEARCH=${ELASTICSEARCH:-"https://elasticsearch-es-http:9200"} ELASTICSEARCH_AUTH_PASS=${ELASTICSEARCH_AUTH_PASS:-""} -PROMETHEUS_AUTH_PASS=${PROMETHEUS_AUTH_PASS:-""} +PROMETHEUS_AUTH_TOKEN=${PROMETHEUS_AUTH_TOKEN:-""} CLOUDNAME=${CLOUDNAME:-"smoke1"} POD=$(hostname) @@ -20,14 +20,14 @@ echo "*** [INFO] Sleeping for 30 seconds to produce all metrics and events" sleep 30 echo "*** [INFO] List of metric names for debugging..." -curl -sk -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names +curl -sk -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names echo; echo # Checks that the metrics actually appear in prometheus echo "*** [INFO] Checking for recent image metrics..." echo "[DEBUG] Running the curl command to return a query" -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=ceilometer_image_size' 2>&1 | grep '"result":\[{"metric":{"__name__":"ceilometer_image_size"' +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/query?" 
--data-urlencode 'query=ceilometer_image_size' 2>&1 | grep '"result":\[{"metric":{"__name__":"ceilometer_image_size"' metrics_result=$? echo "[DEBUG] Set metrics_result to $metrics_result" diff --git a/tests/smoketest/smoketest_collectd_entrypoint.sh b/tests/smoketest/smoketest_collectd_entrypoint.sh index d7f5132e8..d0dd800c3 100755 --- a/tests/smoketest/smoketest_collectd_entrypoint.sh +++ b/tests/smoketest/smoketest_collectd_entrypoint.sh @@ -5,7 +5,7 @@ set +e PROMETHEUS=${PROMETHEUS:-"https://default-prometheus-proxy:9092"} ELASTICSEARCH=${ELASTICSEARCH:-"https://elasticsearch-es-http:9200"} ELASTICSEARCH_AUTH_PASS=${ELASTICSEARCH_AUTH_PASS:-""} -PROMETHEUS_AUTH_PASS=${PROMETHEUS_AUTH_PASS:-""} +PROMETHEUS_AUTH_TOKEN=${PROMETHEUS_AUTH_TOKEN:-""} CLOUDNAME=${CLOUDNAME:-"smoke1"} POD=$(hostname) @@ -37,12 +37,12 @@ sleep 30 echo "*** [INFO] List of metric names for debugging..." -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/label/__name__/values" 2>&2 | tee /tmp/label_names echo; echo # Checks that the metrics actually appear in prometheus echo "*** [INFO] Checking for recent CPU metrics..." -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=collectd_cpu_total{container="sg-core",plugin_instance="0",type_instance="user",service="default-cloud1-coll-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/query?" 
--data-urlencode 'query=collectd_cpu_total{container="sg-core",plugin_instance="0",type_instance="user",service="default-cloud1-coll-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output echo; echo # The egrep exit code is the result of the test and becomes the container/pod/job exit code @@ -53,7 +53,7 @@ echo; echo # Checks that the metrics actually appear in prometheus echo "*** [INFO] Checking for recent healthcheck metrics..." -curl -k -u "internal:${PROMETHEUS_AUTH_PASS}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=sensubility_container_health_status{container="sg-core",service="default-cloud1-sens-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output +curl -k -H "Authorization: Bearer ${PROMETHEUS_AUTH_TOKEN}" -g "${PROMETHEUS}/api/v1/query?" --data-urlencode 'query=sensubility_container_health_status{container="sg-core",service="default-cloud1-sens-meter",host="'"${POD}"'"}[1m]' 2>&2 | tee /tmp/query_output echo; echo # The egrep exit code is the result of the test and becomes the container/pod/job exit code diff --git a/tests/smoketest/smoketest_job.yaml.template b/tests/smoketest/smoketest_job.yaml.template index 4a9c20cc9..12626f3b2 100644 --- a/tests/smoketest/smoketest_job.yaml.template +++ b/tests/smoketest/smoketest_job.yaml.template @@ -22,8 +22,8 @@ spec: value: <> - name: ELASTICSEARCH_AUTH_PASS value: "<>" - - name: PROMETHEUS_AUTH_PASS - value: "<>" + - name: PROMETHEUS_AUTH_TOKEN + value: "<>" - name: OBSERVABILITY_STRATEGY value: "<>" volumeMounts: @@ -51,8 +51,8 @@ spec: value: <> - name: ELASTICSEARCH_AUTH_PASS value: "<>" - - name: PROMETHEUS_AUTH_PASS - value: "<>" + - name: PROMETHEUS_AUTH_TOKEN + value: "<>" - name: OBSERVABILITY_STRATEGY value: "<>" volumeMounts: From cf20fdf4a4f8b5034fbc6555c57ac07a23877d3f Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Thu, 15 Feb 2024 14:23:28 +0000 Subject: [PATCH 22/28] Fix typo in check for setup_from_bundles (#577) --- build/stf-run-ci/tasks/setup_stf_from_bundles.yml | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml index f549fc209..cdb09be85 100644 --- a/build/stf-run-ci/tasks/setup_stf_from_bundles.yml +++ b/build/stf-run-ci/tasks/setup_stf_from_bundles.yml @@ -91,7 +91,7 @@ ansible.builtin.assert: that: - '__smart_gateway_bundle_image_path | default("") | length > 0' - - '__service_telemetry_bindle_image_path | default("") | length > 0' + - '__service_telemetry_bundle_image_path | default("") | length > 0' fail_msg: "Bundle path(s) not set. __smart_gateway_bundle_image_path is '{{ __smart_gateway_bundle_image_path }}' and __service_telemetry_bundle_image_path is '{{ __service_telemetry_bundle_image_path }}'. Both values need to be set." success_msg: "Bundle paths are defined, are not None and have a non-zero-length" From f39dd05226be380406fc923434c5186967992f5c Mon Sep 17 00:00:00 2001 From: Victoria Martinez de la Cruz Date: Fri, 16 Feb 2024 23:48:08 +0100 Subject: [PATCH 23/28] Add related images for Prometheus and Alertmanager (#578) (#580) * Add related images for Prometheus and Alertmanager Add support for RELATED_IMAGES for other artifacts we need in a default deployment when installing via disconnected. These images are required to be specifically called out in disconnected environments as the Prometheus and Alertmanager controllers in the Cluster Observability Operator (COO) don't use the RELATED_IMAGE environment variables specified in the downstream CSV for COO, so we need to specify them ourselves and then deploy Prometheus and Alertmanager using the spec.image reference in the Custom Resource. Closes: JIRA#STF-1713 * Place related images in correct deploy manifest * Use grafana_path_image by default Remove the baseImage configuration for Grafana from the example CR so that in the OLM UI (OCP console) the baseImage isn't a populated value by default. 
Use the value from grafana_image_path as the preferred default as it will be populated from the RELATED_IMAGES_GRAFANA_IMAGE environment variable, which then itself has a default of the base_image value in defaults/main.yml. Only if the administrator sets a specific value for baseImage should we deploy with that image. We do this to allow for a better disconnected environment experience out of the box. * Drop setting Grafana related images For upstream, there isn't a good option that is on quay.io (which doesn't have quotas), and since it's out of scope for this effort, I'm just going to drop it and revert back to how it worked before. * Revert removal of Grafana default image path * Fix issue with alertmanager related image tag var Co-authored-by: Leif Madsen --- build/generate_bundle.sh | 2 +- build/metadata.sh | 4 ++++ .../service-telemetry-operator.clusterserviceversion.yaml | 4 ++++ deploy/operator.yaml | 4 ++++ roles/servicetelemetry/tasks/pre.yml | 4 +++- roles/servicetelemetry/templates/manifest_alertmanager.j2 | 3 +++ roles/servicetelemetry/templates/manifest_prometheus.j2 | 1 + 7 files changed, 20 insertions(+), 2 deletions(-) diff --git a/build/generate_bundle.sh b/build/generate_bundle.sh index e169f3988..d6ad95361 100755 --- a/build/generate_bundle.sh +++ b/build/generate_bundle.sh @@ -29,7 +29,7 @@ generate_dockerfile() { } generate_bundle() { - REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" + 
REPLACE_REGEX="s#<>#${CREATED_DATE}#g;s#<>#${OPERATOR_IMAGE}#g;s#<>#${OPERATOR_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY}#g;s#<>#${RELATED_IMAGE_OAUTH_PROXY_TAG}#g;s#<>#${RELATED_IMAGE_PROMETHEUS}#g;s#<>#${RELATED_IMAGE_PROMETHEUS_TAG}#g;s#<>#${RELATED_IMAGE_ALERTMANAGER}#g;s#<>#${RELATED_IMAGE_ALERTMANAGER_TAG}#g;s#<>#${OPERATOR_BUNDLE_VERSION}#g;s#1.99.0#${OPERATOR_BUNDLE_VERSION}#g;s#<>#${OPERATOR_DOCUMENTATION_URL}#g;s#<>#${BUNDLE_OLM_SKIP_RANGE_LOWER_BOUND}#g" pushd "${REL}/../" > /dev/null 2>&1 ${OPERATOR_SDK} generate bundle --verbose --channels ${BUNDLE_CHANNELS} --default-channel ${BUNDLE_DEFAULT_CHANNEL} --manifests --metadata --version "${OPERATOR_BUNDLE_VERSION}" --output-dir "${WORKING_DIR}" >> ${LOGFILE} 2>&1 diff --git a/build/metadata.sh b/build/metadata.sh index 7b120b509..4054f6d6c 100644 --- a/build/metadata.sh +++ b/build/metadata.sh @@ -21,6 +21,10 @@ RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP:-q RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG=${RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_TAG:-stable-1.5} RELATED_IMAGE_OAUTH_PROXY=${RELATED_IMAGE_OAUTH_PROXY:-quay.io/openshift/origin-oauth-proxy} RELATED_IMAGE_OAUTH_PROXY_TAG=${RELATED_IMAGE_OAUTH_PROXY_TAG:-latest} +RELATED_IMAGE_PROMETHEUS=${RELATED_IMAGE_PROMETHEUS:-quay.io/prometheus/prometheus} +RELATED_IMAGE_PROMETHEUS_TAG=${RELATED_IMAGE_PROMETHEUS_TAG:-latest} +RELATED_IMAGE_ALERTMANAGER=${RELATED_IMAGE_ALERTMANAGER:-quay.io/prometheus/alertmanager} +RELATED_IMAGE_ALERTMANAGER_TAG=${RELATED_IMAGE_ALERTMANAGER_TAG:-latest} BUNDLE_PATH=${BUNDLE_PATH:-deploy/olm-catalog/service-telemetry-operator} BUNDLE_CHANNELS=${BUNDLE_CHANNELS:-stable-1.5} BUNDLE_DEFAULT_CHANNEL=${BUNDLE_DEFAULT_CHANNEL:-stable-1.5} diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml 
b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 00cec8767..0392f58a3 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -295,6 +295,10 @@ spec: value: <>:<> - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE value: <>:<> + - name: RELATED_IMAGE_PROMETHEUS_IMAGE + value: <>:<> + - name: RELATED_IMAGE_ALERTMANAGER + value: <>:<> image: <>:<> imagePullPolicy: Always name: operator diff --git a/deploy/operator.yaml b/deploy/operator.yaml index c56c11daa..b82ea4cf5 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -37,6 +37,10 @@ spec: value: <>:<> - name: RELATED_IMAGE_OAUTH_PROXY_IMAGE value: <>:<> + - name: RELATED_IMAGE_PROMETHEUS_IMAGE + value: <>:<> + - name: RELATED_IMAGE_ALERTMANAGER + value: <>:<> volumes: - name: runner emptyDir: {} diff --git a/roles/servicetelemetry/tasks/pre.yml b/roles/servicetelemetry/tasks/pre.yml index 38477b02b..6b771dec0 100644 --- a/roles/servicetelemetry/tasks/pre.yml +++ b/roles/servicetelemetry/tasks/pre.yml @@ -33,8 +33,10 @@ - name: "Set supporting container image paths" set_fact: - prometheus_webhook_snmp_container_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE') | default('quay.io/infrawatch/prometheus-webhook-snmp:latest', true) }}" # noqa 204 + alertmanager_image_path: "{{ lookup('env', 'RELATED_IMAGE_ALERTMANAGER_IMAGE') | default('quay.io/prometheus/alertmanager:latest', true) }}" # noqa 204 oauth_proxy_image: "{{ lookup('env', 'RELATED_IMAGE_OAUTH_PROXY_IMAGE') | default('quay.io/openshift/origin-oauth-proxy:latest', true) }}" # noqa 204 + prometheus_webhook_snmp_container_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_WEBHOOK_SNMP_IMAGE') | default('quay.io/infrawatch/prometheus-webhook-snmp:latest', true) }}" # noqa 204 + 
prometheus_image_path: "{{ lookup('env', 'RELATED_IMAGE_PROMETHEUS_IMAGE') | default('quay.io/prometheus/prometheus:latest', true) }}" # noqa 204 - name: Adjust defaults when highAvailability.enabled is true block: diff --git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 5b53cc592..4e2287fe9 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -7,6 +7,9 @@ metadata: name: '{{ ansible_operator_meta.name }}' namespace: '{{ ansible_operator_meta.namespace }}' spec: +{% if observability_strategy != "use_community" %} + image: {{ alertmanager_image_path }} +{% endif %} replicas: {{ servicetelemetry_vars.alerting.alertmanager.deployment_size }} serviceAccountName: alertmanager-stf serviceMonitorSelector: diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index 66f2d5a8d..d9610b056 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -9,6 +9,7 @@ metadata: spec: {% if observability_strategy != "use_community" %} version: null + image: {{ prometheus_image_path }} {% else %} version: v2.43.0 {% endif %} From 5f3f9a081fcf214ec4cd17953330fc8b6af233d0 Mon Sep 17 00:00:00 2001 From: Leif Madsen Date: Mon, 26 Feb 2024 10:25:38 -0500 Subject: [PATCH 24/28] Fix env var naming issue (#581) (#582) * Fix env var naming issue Fix an environment variable naming issue in the CSV for STO when attempting to install alertmanager disconnected. The env var being looked up should have a postfix of _IMAGE to match the other env vars. Found in testing by vkmc. * Run operator-sdk generate bundle Run the generate bundle command to make sure everything is in sync. Fixes CI found issue in previous commit. 
(cherry-picked from commit 6568039b83e0caaa557af0a462407077e3c1176c) --- .../service-telemetry-operator.clusterserviceversion.yaml | 2 +- deploy/operator.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml index 0392f58a3..4ec0f7f5c 100644 --- a/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml +++ b/deploy/olm-catalog/service-telemetry-operator/manifests/service-telemetry-operator.clusterserviceversion.yaml @@ -297,7 +297,7 @@ spec: value: <>:<> - name: RELATED_IMAGE_PROMETHEUS_IMAGE value: <>:<> - - name: RELATED_IMAGE_ALERTMANAGER + - name: RELATED_IMAGE_ALERTMANAGER_IMAGE value: <>:<> image: <>:<> imagePullPolicy: Always diff --git a/deploy/operator.yaml b/deploy/operator.yaml index b82ea4cf5..a6a940791 100644 --- a/deploy/operator.yaml +++ b/deploy/operator.yaml @@ -39,7 +39,7 @@ spec: value: <>:<> - name: RELATED_IMAGE_PROMETHEUS_IMAGE value: <>:<> - - name: RELATED_IMAGE_ALERTMANAGER + - name: RELATED_IMAGE_ALERTMANAGER_IMAGE value: <>:<> volumes: - name: runner From eb7d5b0e6836efb9c7107c44473ed435cc7a00cf Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 1 Mar 2024 15:49:56 +0000 Subject: [PATCH 25/28] Update smoketest.sh (#583) (#584) Changed default-alertmanager URL to v2 Co-authored-by: Alex Yefimov <126113326+ayefimov-1@users.noreply.github.com> --- tests/smoketest/smoketest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/smoketest/smoketest.sh b/tests/smoketest/smoketest.sh index 177157fe3..b4a8db29f 100755 --- a/tests/smoketest/smoketest.sh +++ b/tests/smoketest/smoketest.sh @@ -90,7 +90,7 @@ fi # create the alert using startsAt which in theory may cause trigger to be faster echo "*** [INFO] Create alert" -oc delete pod -l 
run=curl ; oc run curl --wait --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v1/alerts" +oc delete pod -l run=curl ; oc run curl --wait --restart='Never' --image=quay.io/infrawatch/busyboxplus:curl -- sh -c "curl -v -k -H \"Content-Type: application/json\" -H \"Authorization: Bearer ${PROMETHEUS_K8S_TOKEN}\" -d '[{\"status\":\"firing\",\"labels\":{\"alertname\":\"smoketest\",\"severity\":\"warning\"},\"startsAt\":\"$(date --rfc-3339=seconds | sed 's/ /T/')\"}]' https://default-alertmanager-proxy:9095/api/v2/alerts" oc wait --for=jsonpath='{.status.phase}'=Succeeded pod/curl oc logs curl From cbefc68eae5a3ecbd40f1e48abc7a3a6dc61104c Mon Sep 17 00:00:00 2001 From: Emma Foley Date: Fri, 31 May 2024 13:29:42 +0100 Subject: [PATCH 26/28] [zuul] Remove OCP 4.13 jobs (#602) OCP 4.13 jobs were added as a way to test ocp-latest before 4.14 was released --- .zuul.yaml | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index a418c6506..636f0cf44 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -7,14 +7,6 @@ - name: crc label: coreos-crc-extracted-2-19-0-xxl -- nodeset: - name: stf-crc_extracted-ocp413 - nodes: - - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost - - name: crc - label: coreos-crc-extracted-2-28-0-xxl - - nodeset: name: stf-crc_extracted-ocp414 nodes: @@ -136,13 +128,6 @@ Deploy STF using the nightly bundles on OCP 4.12 nodeset: stf-crc_extracted-ocp412 -- job: - name: stf-crc-ocp_413-nightly_bundles - parent: stf-crc-nightly_bundles - description: | - Deploy STF using the nightly bundles on OCP 4.13 - nodeset: stf-crc_extracted-ocp413 - - job: name: 
stf-crc-ocp_414-nightly_bundles parent: stf-crc-nightly_bundles @@ -157,13 +142,6 @@ Build images locally and deploy STF on OCP 4.12 nodeset: stf-crc_extracted-ocp412 -- job: - name: stf-crc-ocp_413-local_build - parent: stf-crc-local_build - description: | - Build images locally and deploy STF on OCP 4.13 - nodeset: stf-crc_extracted-ocp413 - - job: name: stf-crc-ocp_414-local_build parent: stf-crc-local_build @@ -178,13 +156,6 @@ Build STF locally and deploy from index on OCP 4.12 nodeset: stf-crc_extracted-ocp412 -- job: - name: stf-crc-ocp_413-local_build-index_deploy - parent: stf-crc-local_build-index_deploy - description: | - Build STF locally and deploy from index on OCP 4.13 - nodeset: stf-crc_extracted-ocp413 - - job: name: stf-crc-ocp_414-local_build-index_deploy parent: stf-crc-local_build-index_deploy @@ -199,10 +170,8 @@ github-check: jobs: - stf-crc-ocp_412-local_build - - stf-crc-ocp_413-local_build - stf-crc-ocp_414-local_build - stf-crc-ocp_412-local_build-index_deploy - - stf-crc-ocp_413-local_build-index_deploy - stf-crc-ocp_414-local_build-index_deploy - project: @@ -212,5 +181,4 @@ periodic: jobs: - stf-crc-ocp_412-nightly_bundles - - stf-crc-ocp_413-nightly_bundles - stf-crc-ocp_414-nightly_bundles From 3b6b1e11711708a73cbafee2abb9c4d58a15a964 Mon Sep 17 00:00:00 2001 From: Daniel Pawlik <3049495+danpawlik@users.noreply.github.com> Date: Wed, 17 Jul 2024 19:19:48 +0200 Subject: [PATCH 27/28] Update nodesets label (#614) The CI jobs do not need to be forced to use a single cloud provider. This commit sets a label that is defined for multiple cloud providers in Zuul. The available labels are listed at [1].
[1] https://review.rdoproject.org/zuul/labels --- .zuul.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.zuul.yaml b/.zuul.yaml index 636f0cf44..c737b6b52 100644 --- a/.zuul.yaml +++ b/.zuul.yaml @@ -3,7 +3,7 @@ name: stf-crc_extracted-ocp412 nodes: - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost + label: cloud-centos-9-stream-tripleo - name: crc label: coreos-crc-extracted-2-19-0-xxl @@ -11,7 +11,7 @@ name: stf-crc_extracted-ocp414 nodes: - name: controller - label: cloud-centos-9-stream-tripleo-vexxhost + label: cloud-centos-9-stream-tripleo - name: crc label: coreos-crc-extracted-2-30-0-xxl From 2daaf950aedec7fdeb74156f86210cd1b22ce1ec Mon Sep 17 00:00:00 2001 From: Chris Sibbitt Date: Thu, 18 Jul 2024 05:24:14 -0400 Subject: [PATCH 28/28] Fix oauth SARs for interactive login (#612) - Doesn't work unless resource name is plural form - The "group" property is called "resourceAPIGroup" in SARs -"to avoid confusion with the 'groups' field when inlined"[1] [1] https://docs.openshift.com/container-platform/4.14/rest_api/authorization_apis/subjectaccessreview-authorization-openshift-io-v1.html --- roles/servicetelemetry/tasks/component_prometheus_reader.yml | 2 +- roles/servicetelemetry/templates/manifest_alertmanager.j2 | 2 +- roles/servicetelemetry/templates/manifest_grafana.j2 | 2 +- roles/servicetelemetry/templates/manifest_grafana_v5.j2 | 2 +- roles/servicetelemetry/templates/manifest_prometheus.j2 | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/roles/servicetelemetry/tasks/component_prometheus_reader.yml b/roles/servicetelemetry/tasks/component_prometheus_reader.yml index 6cbee8b42..c86f021fc 100644 --- a/roles/servicetelemetry/tasks/component_prometheus_reader.yml +++ b/roles/servicetelemetry/tasks/component_prometheus_reader.yml @@ -21,7 +21,7 @@ - apiGroups: - '{{ prometheus_operator_api_string | replace("/v1","") }}' resources: - - prometheus + - prometheuses verbs: - get namespaces: diff 
--git a/roles/servicetelemetry/templates/manifest_alertmanager.j2 b/roles/servicetelemetry/templates/manifest_alertmanager.j2 index 4e2287fe9..f70bb8e1f 100644 --- a/roles/servicetelemetry/templates/manifest_alertmanager.j2 +++ b/roles/servicetelemetry/templates/manifest_alertmanager.j2 @@ -29,7 +29,7 @@ spec: - -upstream=http://localhost:9093/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=alertmanager-stf - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "resourceAPIGroup":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' - '-openshift-delegate-urls={"/": {"namespace":"{{ ansible_operator_meta.namespace }}", "resource": "alertmanagers", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9095 diff --git a/roles/servicetelemetry/templates/manifest_grafana.j2 b/roles/servicetelemetry/templates/manifest_grafana.j2 index 8b176b103..b7fdd6a4c 100644 --- a/roles/servicetelemetry/templates/manifest_grafana.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana.j2 @@ -42,7 +42,7 @@ spec: - -upstream=http://localhost:3000 - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=grafana-serviceaccount - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"integreatly.org", "verb":"get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafanas", "resourceAPIGroup":"integreatly.org", "verb":"get"}' - -openshift-ca=/etc/pki/tls/cert.pem - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt ports: diff --git a/roles/servicetelemetry/templates/manifest_grafana_v5.j2 
b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 index 278e452ff..4c775c411 100644 --- a/roles/servicetelemetry/templates/manifest_grafana_v5.j2 +++ b/roles/servicetelemetry/templates/manifest_grafana_v5.j2 @@ -61,7 +61,7 @@ spec: - '-https-address=:3002' - '-http-address=' - '-email-domain=*' - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafana", "group":"grafana.integreatly.org", "verb":"get"}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "grafanas", "resourceAPIGroup":"grafana.integreatly.org", "verb":"get"}' - '-upstream=http://localhost:3000' - '-tls-cert=/etc/tls/private/tls.crt' - '-tls-key=/etc/tls/private/tls.key' diff --git a/roles/servicetelemetry/templates/manifest_prometheus.j2 b/roles/servicetelemetry/templates/manifest_prometheus.j2 index d9610b056..533161080 100644 --- a/roles/servicetelemetry/templates/manifest_prometheus.j2 +++ b/roles/servicetelemetry/templates/manifest_prometheus.j2 @@ -48,8 +48,8 @@ spec: - -upstream=http://localhost:9090/ - -cookie-secret-file=/etc/proxy/secrets/session_secret - -openshift-service-account=prometheus-stf - - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' - - '-openshift-delegate-urls={"/":{"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheus", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' + - '-openshift-sar={"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheuses", "resourceAPIGroup":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}' + - '-openshift-delegate-urls={"/":{"namespace":"{{ ansible_operator_meta.namespace }}","resource": "prometheuses", "group":"{{ prometheus_operator_api_string | replace("/v1","") }}", "verb":"get"}}' ports: - containerPort: 9092