feat: remove dependency on KFP lib + eval serving endpoint support + ci #76

Merged · 7 commits · Oct 10, 2024
Changes from all commits
6 changes: 6 additions & 0 deletions .github/workflows/pre_commit.yaml
@@ -34,3 +34,9 @@ jobs:
- name: Run pre-commit
run: |
pre-commit run --all-files

- name: Test if pipeline is up-to-date
run: |
pip install click kfp==2.9.0 kfp.kubernetes
make pipeline
git diff --exit-code || (echo "Pipeline is not up-to-date. Please run 'make pipeline' and commit the changes." && exit 1)
5 changes: 4 additions & 1 deletion Makefile
@@ -1,5 +1,8 @@
.PHONY: standalone
.PHONY: standalone pipeline

standalone:
python3 pipeline.py gen-standalone
ruff format standalone/standalone.py

pipeline:
python3 pipeline.py
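
The new CI step and the 'pipeline' Makefile target both rely on pipeline.py regenerating the compiled pipeline deterministically, so that 'git diff --exit-code' can flag a stale committed copy. As a rough sketch of what such an entry point can look like with click and kfp 2.x (the pipeline name, component, and output path are illustrative assumptions, not code from this repository):

# Hypothetical sketch of a pipeline.py entry point that 'make pipeline' could invoke.
import click
from kfp import compiler, dsl


@dsl.component(base_image="python:3.11")
def echo_op(msg: str) -> str:
    return msg


@dsl.pipeline(name="example-pipeline")
def example_pipeline(msg: str = "hello"):
    echo_op(msg=msg)


@click.command()
@click.option("--output", default="pipeline.yaml", show_default=True)
def cli(output: str):
    # Compile the pipeline definition to YAML; the CI step above then checks that
    # the committed file matches this freshly generated output.
    compiler.Compiler().compile(pipeline_func=example_pipeline, package_path=output)


if __name__ == "__main__":
    cli()

The real pipeline.py also exposes a gen-standalone mode (see the standalone target above), which this sketch omits.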
13 changes: 9 additions & 4 deletions eval/final/components.py
@@ -17,7 +17,6 @@
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
candidate_model: str,
base_model_dir: str,
tasks: Input[Dataset],
taxonomy: Input[Dataset],
@@ -29,6 +28,7 @@ def run_final_eval_op(
few_shots: int,
batch_size: int,
merge_system_user_message: bool,
candidate_model: str = None,
):
import json
import os
@@ -43,6 +43,11 @@
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

# For standalone mode
if candidate_model is None:
# logic to get the best model from the models folder and results
pass
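
The standalone branch above is left as a stub in this PR. Purely as a hypothetical illustration of picking 'the best model from the models folder and results' (the directory layout, results file name, and score key are assumptions, not taken from this PR), the selection could look like:

# Hypothetical sketch only: pick the checkpoint whose recorded results file reports
# the highest overall score. Directory layout and file/key names are assumptions.
import json
import os


def find_best_candidate(models_folder: str, results_file: str = "results.json") -> str:
    best_path, best_score = None, float("-inf")
    for name in os.listdir(models_folder):
        candidate_dir = os.path.join(models_folder, name)
        results_path = os.path.join(candidate_dir, results_file)
        if not os.path.isfile(results_path):
            continue
        with open(results_path, encoding="utf-8") as f:
            score = json.load(f).get("overall_score", float("-inf"))
        if score > best_score:
            best_path, best_score = candidate_dir, score
    if best_path is None:
        raise FileNotFoundError(f"No scored candidates found under {models_folder}")
    return best_path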

######################################################################
# branch_eval_summary_to_json creates a json object from output of instructlab/eval
# TODO: Add this to the instructlab/eval or instructlab/instructlab repository
@@ -221,7 +226,7 @@ def find_node_dataset_directories(base_directory: str):

######################################################################
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
# and when that happens, much of this logic can be imported from the `evaluate` definition:
# and when that happens, much of this logic can be imported from the 'evaluate' definition:
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
#
# With instructlab, model_name is synonymous with model_path
@@ -244,8 +249,8 @@ def find_node_dataset_directories(base_directory: str):
),
]

# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
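The hunk is truncated here, but the 'auto' magic word described above ultimately has to resolve to an integer worker count before the evaluators run. As a rough, purely illustrative sketch of that kind of environment-based resolution (the real calculation lives in instructlab/eval at the link above; the numbers here are placeholders):

# Illustrative heuristic only; not the formula used by instructlab/eval.
import multiprocessing

import torch


def resolve_max_workers(max_workers):
    if max_workers != "auto":
        return int(max_workers)
    if torch.cuda.is_available():
        # Assume a handful of concurrent requests per visible GPU.
        return max(1, torch.cuda.device_count() * 4)
    # CPU-only fallback: leave headroom for the serving process itself.
    return max(1, multiprocessing.cpu_count() // 2)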
98 changes: 89 additions & 9 deletions eval/mt_bench/components.py
@@ -12,8 +12,8 @@ def run_mt_bench_op(
models_path_prefix: str,
mt_bench_output: Output[Artifact],
merge_system_user_message: bool,
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str,
models_list: List[str] = None,
@@ -24,13 +24,93 @@
import os

import torch
from helpers import (
VLLM_SERVER,
launch_vllm,
stop_vllm,
)
from instructlab.eval.mt_bench import MTBenchEvaluator

VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
):
import subprocess
import sys
import time

import requests

if gpu_count > 0:
command = [
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
"--tensor-parallel-size",
str(gpu_count),
]
else:
command = [
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
]

subprocess.Popen(args=command)

print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")

for attempt in range(retries):
try:
response = requests.get(f"{VLLM_SERVER}/models")
if response.status_code == 200:
print(f"vLLM server is up and running at {VLLM_SERVER}.")
return
except requests.ConnectionError:
pass

print(
f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
)
time.sleep(delay)

raise RuntimeError(
f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
)

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
def stop_vllm():
import psutil

for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
cmdline = process.info.get("cmdline")
if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
print(
f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
)
try:
process.terminate() # Try graceful termination
process.wait(timeout=5) # Wait a bit for it to terminate
if process.is_running():
print(
f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
)
process.kill() # Force kill if it's still running
print(
f"Successfully stopped vLLM server with PID: {process.info['pid']}"
)
except psutil.NoSuchProcess:
print(f"Process with PID {process.info['pid']} no longer exists.")
except psutil.AccessDenied:
print(
f"Access denied when trying to terminate process with PID {process.info['pid']}."
)
except Exception as e:
print(
f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
)
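
Taken together, the inlined helpers above are meant to bracket the evaluation: start a local vLLM server for the model under test, run the MT-Bench calls against VLLM_SERVER, and always tear the server down afterwards. A usage sketch under those assumptions (the evaluation call itself is elided, since its exact shape is not shown in this hunk):

# Illustrative usage of the helpers above; not code from this PR.
gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
for model_path in models_list or []:
    launch_vllm(model_path, gpu_count)
    try:
        # ... generate and judge MT-Bench answers against VLLM_SERVER here ...
        pass
    finally:
        stop_vllm()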

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

gpu_available = torch.cuda.is_available()
@@ -53,8 +133,8 @@ def run_mt_bench_op(
scores = {}
all_mt_bench_data = []

# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try: