feat: remove dependency on KFP lib + eval serving endpoint support + ci #76

Merged · 7 commits · Oct 10, 2024
Changes from all commits
6 changes: 6 additions & 0 deletions .github/workflows/pre_commit.yaml
@@ -34,3 +34,9 @@ jobs:
- name: Run pre-commit
run: |
pre-commit run --all-files

- name: Test if pipeline is up-to-date
run: |
pip install click kfp==2.9.0 kfp.kubernetes
make pipeline
git diff --exit-code || (echo "Pipeline is not up-to-date. Please run 'make pipeline' and commit the changes." && exit 1)
5 changes: 4 additions & 1 deletion Makefile
@@ -1,5 +1,8 @@
.PHONY: standalone
.PHONY: standalone pipeline

standalone:
python3 pipeline.py gen-standalone
ruff format standalone/standalone.py

pipeline:
python3 pipeline.py
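
The new CI step and the 'pipeline' Makefile target both rely on pipeline.py regenerating the compiled pipeline deterministically, so that 'git diff --exit-code' can flag a stale committed copy. As a rough sketch of what such an entry point can look like with click and kfp 2.x (the pipeline name, component, and output path are illustrative assumptions, not code from this repository):

# Hypothetical sketch of a pipeline.py entry point that 'make pipeline' could invoke.
import click
from kfp import compiler, dsl


@dsl.component(base_image="python:3.11")
def echo_op(msg: str) -> str:
    return msg


@dsl.pipeline(name="example-pipeline")
def example_pipeline(msg: str = "hello"):
    echo_op(msg=msg)


@click.command()
@click.option("--output", default="pipeline.yaml", show_default=True)
def cli(output: str):
    # Compile the pipeline definition to YAML; the CI step above then checks that
    # the committed file matches this freshly generated output.
    compiler.Compiler().compile(pipeline_func=example_pipeline, package_path=output)


if __name__ == "__main__":
    cli()

The real pipeline.py also exposes a gen-standalone mode (see the standalone target above), which this sketch omits.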
13 changes: 9 additions & 4 deletions eval/final/components.py
@@ -17,7 +17,6 @@
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
candidate_model: str,
base_model_dir: str,
tasks: Input[Dataset],
taxonomy: Input[Dataset],
@@ -29,6 +28,7 @@ def run_final_eval_op(
few_shots: int,
batch_size: int,
merge_system_user_message: bool,
candidate_model: str = None,
):
import json
import os
@@ -43,6 +43,11 @@
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

# For standalone mode
if candidate_model is None:
# logic to get the best model from the models folder and results
pass
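
The standalone branch above is left as a stub in this PR. Purely as a hypothetical illustration of picking 'the best model from the models folder and results' (the directory layout, results file name, and score key are assumptions, not taken from this PR), the selection could look like:

# Hypothetical sketch only: pick the checkpoint whose recorded results file reports
# the highest overall score. Directory layout and file/key names are assumptions.
import json
import os


def find_best_candidate(models_folder: str, results_file: str = "results.json") -> str:
    best_path, best_score = None, float("-inf")
    for name in os.listdir(models_folder):
        candidate_dir = os.path.join(models_folder, name)
        results_path = os.path.join(candidate_dir, results_file)
        if not os.path.isfile(results_path):
            continue
        with open(results_path, encoding="utf-8") as f:
            score = json.load(f).get("overall_score", float("-inf"))
        if score > best_score:
            best_path, best_score = candidate_dir, score
    if best_path is None:
        raise FileNotFoundError(f"No scored candidates found under {models_folder}")
    return best_path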

######################################################################
# branch_eval_summary_to_json creates a json object from output of instructlab/eval
# TODO: Add this to the instructlab/eval or instructlab/instructlab repository
@@ -221,7 +226,7 @@ def find_node_dataset_directories(base_directory: str):

######################################################################
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
# and when that happens, much of this logic can be imported from the `evaluate` definition:
# and when that happens, much of this logic can be imported from the 'evaluate' definition:
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
#
# With instructlab, model_name is synonymous with model_path
@@ -244,8 +249,8 @@ def find_node_dataset_directories(base_directory: str):
),
]

# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
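The hunk is truncated here, but the 'auto' magic word described above ultimately has to resolve to an integer worker count before the evaluators run. As a rough, purely illustrative sketch of that kind of environment-based resolution (the real calculation lives in instructlab/eval at the link above; the numbers here are placeholders):

# Illustrative heuristic only; not the formula used by instructlab/eval.
import multiprocessing

import torch


def resolve_max_workers(max_workers):
    if max_workers != "auto":
        return int(max_workers)
    if torch.cuda.is_available():
        # Assume a handful of concurrent requests per visible GPU.
        return max(1, torch.cuda.device_count() * 4)
    # CPU-only fallback: leave headroom for the serving process itself.
    return max(1, multiprocessing.cpu_count() // 2)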
98 changes: 89 additions & 9 deletions eval/mt_bench/components.py
@@ -12,8 +12,8 @@ def run_mt_bench_op(
models_path_prefix: str,
mt_bench_output: Output[Artifact],
merge_system_user_message: bool,
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str,
models_list: List[str] = None,
@@ -24,13 +24,93 @@
import os

import torch
from helpers import (
VLLM_SERVER,
launch_vllm,
stop_vllm,
)
from instructlab.eval.mt_bench import MTBenchEvaluator

VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
):
import subprocess
import sys
import time

import requests

if gpu_count > 0:
command = [
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
"--tensor-parallel-size",
str(gpu_count),
]
else:
command = [
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
]

subprocess.Popen(args=command)

print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")

for attempt in range(retries):
try:
response = requests.get(f"{VLLM_SERVER}/models")
if response.status_code == 200:
print(f"vLLM server is up and running at {VLLM_SERVER}.")
return
except requests.ConnectionError:
pass

print(
f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
)
time.sleep(delay)

raise RuntimeError(
f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
)

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
def stop_vllm():
import psutil

for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
cmdline = process.info.get("cmdline")
if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
print(
f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
)
try:
process.terminate() # Try graceful termination
process.wait(timeout=5) # Wait a bit for it to terminate
if process.is_running():
print(
f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
)
process.kill() # Force kill if it's still running
print(
f"Successfully stopped vLLM server with PID: {process.info['pid']}"
)
except psutil.NoSuchProcess:
print(f"Process with PID {process.info['pid']} no longer exists.")
except psutil.AccessDenied:
print(
f"Access denied when trying to terminate process with PID {process.info['pid']}."
)
except Exception as e:
print(
f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
)
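
Taken together, the inlined helpers above are meant to bracket the evaluation: start a local vLLM server for the model under test, run the MT-Bench calls against VLLM_SERVER, and always tear the server down afterwards. A usage sketch under those assumptions (the evaluation call itself is elided, since its exact shape is not shown in this hunk):

# Illustrative usage of the helpers above; not code from this PR.
gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
for model_path in models_list or []:
    launch_vllm(model_path, gpu_count)
    try:
        # ... generate and judge MT-Bench answers against VLLM_SERVER here ...
        pass
    finally:
        stop_vllm()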

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

gpu_available = torch.cuda.is_available()
@@ -53,8 +133,8 @@ def run_mt_bench_op(
scores = {}
all_mt_bench_data = []

# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try: