add smoke test for crash logs [APMON-1544] #3243

Draft
wants to merge 13 commits into base: main
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
@@ -189,7 +189,7 @@ onboarding_python:
test-app-python-container-3.11-alpine,
test-app-python-container-3.12-alpine,
]
SCENARIO: [INSTALLER_AUTO_INJECTION]
SCENARIO: [INSTALLER_AUTO_INJECTION, CONTAINER_AUTO_INJECTION_INSTALL_SCRIPT_PROFILING, CONTAINER_AUTO_INJECTION_INSTALL_SCRIPT_CRASHTRACKING]
- ONBOARDING_FILTER_ENV: [dev, prod]
ONBOARDING_FILTER_WEBLOG: [test-app-python]
SCENARIO: [INSTALLER_AUTO_INJECTION_LD_PRELOAD]
@@ -24,6 +24,13 @@ def index(request):
return HttpResponse("test")


def crashme(request):
import ctypes

ctypes.string_at(0)


urlpatterns = [
path("", index),
path("crashme", crashme),
]
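The new crashme endpoint relies on ctypes.string_at(0) dereferencing a NULL pointer in native code, so the worker process dies with SIGSEGV instead of raising a Python exception, which is exactly the kind of failure the crash tracker should report. A minimal standalone sketch (not part of this PR) that demonstrates the effect without killing the test runner:

import signal
import subprocess
import sys

# Run the crashing call in a child interpreter so the parent process survives.
proc = subprocess.run(
    [sys.executable, "-c", "import ctypes; ctypes.string_at(0)"],
    check=False,
)
# On POSIX systems a signal-terminated child reports the negated signal number.
assert proc.returncode == -signal.SIGSEGV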
8 changes: 7 additions & 1 deletion tests/auto_inject/test_auto_inject_install.py
@@ -1,6 +1,5 @@
from utils import scenarios, features, flaky
from utils.tools import logger
from utils import scenarios, features
import tests.auto_inject.utils as base
from utils.virtual_machine.utils import parametrize_virtual_machines

@@ -83,6 +82,13 @@ def test_install(self, virtual_machine):
self._test_install(virtual_machine, profile=True)


@scenarios.container_auto_injection_install_script_crashtracking
class TestContainerAutoInjectInstallScriptCrashTracking(base.AutoInjectBaseTest):
@parametrize_virtual_machines()
def test_install(self, virtual_machine):
self._test_install(virtual_machine, crashlog=True)


@features.installer_auto_instrumentation
@scenarios.installer_auto_injection
class TestInstallerAutoInjectManual(base.AutoInjectBaseTest):
27 changes: 8 additions & 19 deletions tests/auto_inject/utils.py
@@ -1,17 +1,15 @@
import os
import pytest
import paramiko
from utils.tools import logger
from utils.onboarding.weblog_interface import make_get_request, warmup_weblog, make_internal_get_request
from utils.onboarding.weblog_interface import make_get_request, warmup_weblog, request_weblog
from utils.onboarding.backend_interface import wait_backend_trace_id
from utils.onboarding.backend_interface import cause_and_verify_crash
from utils.onboarding.wait_for_tcp_port import wait_for_port
from utils.virtual_machine.vm_logger import vm_logger
from utils import context
from threading import Timer


class AutoInjectBaseTest:
def _test_install(self, virtual_machine, profile: bool = False):
def _test_install(self, virtual_machine, profile: bool = False, crashlog: bool = False):
""" We can easily install agent and lib injection software from agent installation script. Given a sample application we can enable tracing using local environment variables.
After starting application we can see application HTTP requests traces in the backend.
Using the agent installation script we can install different versions of the software (release or beta) in different OS."""
@@ -21,22 +19,13 @@ def _test_install(self, virtual_machine, profile: bool = False):
vm_logger(context.scenario.name, virtual_machine.name).info(
f"{header} \n {header} \n Launching the uninstall for VM: {virtual_machine.name} \n {header} \n {header}"
)
request_uuid = None
if virtual_machine.krunvm_config is not None and virtual_machine.krunvm_config.stdin is not None:
logger.info(
f"We are testing on krunvm. The request to the weblog will be done using the stdin (inside the microvm)"
)
request_uuid = make_internal_get_request(virtual_machine.krunvm_config.stdin, vm_port)
else:
logger.info(f"Waiting for weblog available [{vm_ip}:{vm_port}]")
wait_for_port(vm_port, vm_ip, 80.0)
logger.info(f"[{vm_ip}]: Weblog app is ready!")
warmup_weblog(f"http://{vm_ip}:{vm_port}/")
logger.info(f"Making a request to weblog [{vm_ip}:{vm_port}]")
request_uuid = make_get_request(f"http://{vm_ip}:{vm_port}/")
vm_name = virtual_machine.name
request_uuid = request_weblog(virtual_machine, vm_ip, vm_port)

logger.info(f"Http request done with uuid: [{request_uuid}] for ip [{vm_ip}]")
wait_backend_trace_id(request_uuid, 120.0, profile=profile)
runtime_id = wait_backend_trace_id(request_uuid, 120.0, profile=profile)
if crashlog:
cause_and_verify_crash(runtime_id, vm_ip, vm_port)

def close_channel(self, channel):
try:
9 changes: 8 additions & 1 deletion utils/_context/_scenarios/__init__.py
@@ -1,4 +1,3 @@
import os
import json

import pytest
@@ -618,6 +617,14 @@ def all_endtoend_scenarios(test_object):
github_workflow="libinjection",
)

container_auto_injection_install_script_crashtracking = InstallerAutoInjectionScenario(
"CONTAINER_AUTO_INJECTION_INSTALL_SCRIPT_CRASHTRACKING",
"Onboarding Container Single Step Instrumentation crashtracking scenario using agent auto install script",
vm_provision="container-auto-inject-install-script",
scenario_groups=[ScenarioGroup.ONBOARDING],
github_workflow="libinjection",
)

host_auto_injection_install_script = InstallerAutoInjectionScenario(
"HOST_AUTO_INJECTION_INSTALL_SCRIPT",
"Onboarding Host Single Step Instrumentation scenario using agent auto install script",
90 changes: 73 additions & 17 deletions utils/onboarding/backend_interface.py
@@ -1,8 +1,12 @@
import functools
import os
import time
from typing import Callable
from typing import Optional
from datetime import datetime, timedelta, timezone
import requests
from utils.tools import logger
from utils.onboarding.weblog_interface import make_get_request


def _headers():
@@ -69,31 +73,83 @@ def _query_for_profile(runtime_id):
data = r.json()["data"]
# Check if we got any profile events
if isinstance(data, list) and len(data) > 0:
return r.status_code
return -1
return (r.status_code,)
return (-1,)
return (r.status_code,)
except Exception as e:
logger.error(f"Error received connecting to host: [{host}] {e} ")
return -1
return (-1,)


def wait_backend_trace_id(trace_id, timeout: float = 5.0, profile: bool = False, validator=None):
def _query_for_crash_log(runtime_id):
path = "/api/v2/logs/events/search"
host = "https://api.datadoghq.com"
try:
time_to = datetime.now(timezone.utc)
time_from = time_to - timedelta(minutes=10)

queryJson = {
"filter": {
"from": time_from.isoformat(timespec="seconds"),
"to": time_to.isoformat(timespec="seconds"),
"query": f'service:instrumentation-telemetry-data (@tags.severity:crash OR severity:crash OR signum:*) @metadata.tags:"runtime-id:{runtime_id}"',
},
}
logger.debug(f"Posting to {host}{path} with query: {queryJson}")
headers = _headers()
headers["Content-Type"] = "application/json"
r = requests.post(f"{host}{path}", headers=headers, timeout=10, json=queryJson)
logger.debug(f" Backend response status for crash events for runtime [{runtime_id}]: [{r.status_code}]")
if r.status_code == 200:
logger.debug(f" Backend response for crash events for runtime [{runtime_id}]: [{r.text}]")
data = r.json()["data"]
if isinstance(data, list) and len(data) > 0:
return (r.status_code,)
return (-1,)
return (r.status_code,)
except Exception as e:
logger.error(f"Error received connecting to host: [{host}] {e} ")
return (-1,)


def _retry_request_until_timeout(request_fn: Callable, timeout: float = 5.0):
start_time = time.perf_counter()
while True:
status, runtime_id = _query_for_trace_id(trace_id, validator=validator)
if status != 200:
return_value = request_fn()
if return_value[0] != 200:
time.sleep(2)
else:
logger.info(f"trace [{trace_id}] found in the backend!")
if profile:
while True:
if _query_for_profile(runtime_id) != 200:
time.sleep(2)
else:
logger.info(f"profile for trace [{trace_id}] (runtime [{runtime_id}]) found in the backend!")
break
if time.perf_counter() - start_time >= timeout:
raise TimeoutError("Backend timeout waiting for profile")
break
if time.perf_counter() - start_time >= timeout:
raise TimeoutError("Backend timeout waiting for trace")
raise TimeoutError("Backend timeout")
return return_value


def wait_backend_data(
trace_id=None,
timeout: float = 5.0,
profile: bool = False,
appsec: bool = False,
crashlog: bool = False,
validator=None,
) -> Optional[str]:
runtime_id = None
if trace_id is not None:
status, runtime_id = _retry_request_until_timeout(
functools.partial(_query_for_trace_id, trace_id, validator=validator), timeout=10.0
)
logger.info(f"trace [{trace_id}] found in the backend!")
if profile and runtime_id is not None:
(status,) = _retry_request_until_timeout(functools.partial(_query_for_profile, runtime_id))
logger.info(f"profile for trace [{trace_id}] (runtime [{runtime_id}]) found in the backend!")
return runtime_id


wait_backend_trace_id = wait_backend_data


def cause_and_verify_crash(runtime_id: str, vm_ip: str, vm_port: str):
logger.info(f"Making a crash-inducing request to weblog [{vm_ip}:{vm_port}]")
make_get_request(f"http://{vm_ip}:{vm_port}/crashme", swallow=True)
(status,) = _retry_request_until_timeout(functools.partial(_query_for_crash_log, runtime_id), timeout=600.0)
logger.info(f"crash from runtime {runtime_id} found in the backend!")
48 changes: 36 additions & 12 deletions utils/onboarding/weblog_interface.py
@@ -2,19 +2,27 @@
from random import randint
import os
import requests
from utils.onboarding.wait_for_tcp_port import wait_for_port
from utils.tools import logger


def make_get_request(app_url):
def make_get_request(app_url, swallow: bool = False) -> str:
generated_uuid = str(randint(1, 100000000000000000))
requests.get(
app_url,
headers={
"x-datadog-trace-id": generated_uuid,
"x-datadog-parent-id": generated_uuid,
"x-datadog-sampling-priority": "2",
},
timeout=10,
)
try:
requests.get(
app_url,
headers={
"x-datadog-trace-id": generated_uuid,
"x-datadog-parent-id": generated_uuid,
"x-datadog-sampling-priority": "2",
},
timeout=10,
)
except Exception as e:
if not swallow:
raise
else:
logger.warning(e)
return generated_uuid


@@ -28,13 +36,13 @@ def warmup_weblog(app_url):


def make_internal_get_request(stdin_file, vm_port):
""" This method is exclusively for testing through KrunVm microVM.
""" This method is exclusively for testing through KrunVm microVM.
It is used to make a request to the weblog application inside the VM, using stdin file"""

generated_uuid = str(randint(1, 100000000000000000))
timeout = 80
script_to_run = f"""#!/bin/bash
echo "Requesting weblog..."
echo "Requesting weblog..."
URL="http://localhost:{vm_port}/"
TIMEOUT={timeout}
TRACE_ID={generated_uuid}
@@ -74,3 +82,19 @@ def make_internal_get_request(stdin_file, vm_port):
raise TimeoutError("Timed out waiting for weblog ready")

return generated_uuid


def request_weblog(virtual_machine, vm_ip, vm_port) -> str:
if virtual_machine.krunvm_config is not None and virtual_machine.krunvm_config.stdin is not None:
logger.info(
"We are testing on krunvm. The request to the weblog will be done using the stdin (inside the microvm)"
)
request_uuid = make_internal_get_request(virtual_machine.krunvm_config.stdin, vm_port)
else:
logger.info(f"Waiting for weblog available [{vm_ip}:{vm_port}]")
wait_for_port(vm_port, vm_ip, 80.0)
logger.info(f"[{vm_ip}]: Weblog app is ready!")
warmup_weblog(f"http://{vm_ip}:{vm_port}/")
logger.info(f"Making a request to weblog [{vm_ip}:{vm_port}]")
request_uuid = make_get_request(f"http://{vm_ip}:{vm_port}/")
return request_uuid