From 34de7d8a3621eef02b79706d8235799245e7ecd3 Mon Sep 17 00:00:00 2001
From: Lanture1064
Date: Tue, 9 Jan 2024 14:14:53 +0800
Subject: [PATCH] feat: add ragas_cli pip pkg

---
 evaluation/README.md       | 54 +++++++++++++++++++++
 evaluation/ragas-sample.py | 88 ----------------------------------
 evaluation/run/run.py      | 48 +++++++++++++++++++
 evaluation/setup.py        | 27 +++++++++++
 evaluation/src/pkg.py      | 96 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 225 insertions(+), 88 deletions(-)
 create mode 100644 evaluation/README.md
 delete mode 100644 evaluation/ragas-sample.py
 create mode 100644 evaluation/run/run.py
 create mode 100644 evaluation/setup.py
 create mode 100644 evaluation/src/pkg.py

diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 000000000..c9f5145de
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,54 @@
+# Ragas CLI
+
+A one-step Ragas CLI tool for evaluating QCAG test sets generated by RAG applications (Q = question, C = contexts, A = answer, G = ground_truth).
+
+## Install with pip
+
+```bash
+pip install ragacli
+```
+
+## Arguments
+
+- `--model`: Specifies the model to use for evaluation.
+  - Defaults to "gpt-3.5-turbo". Must be LangChain compatible.
+- `--api_base`: Specifies the base URL for the API.
+  - Defaults to "https://api.openai.com/v1".
+- `--api_key`: Specifies the API key used to authenticate requests.
+  - Not required when using an OpenAI-compatible API server such as vLLM or FastChat.
+- `--embeddings`: Specifies the Hugging Face embeddings model to use for evaluation.
+  - The embeddings run **locally**.
+  - Falls back to OpenAI embeddings if not set.
+  - Recommended when using an OpenAI-compatible API server.
+- `--metrics`: Specifies the metrics to use for evaluation.
+  - Falls back to the Ragas default metrics if not set.
+  - Default metrics: `["answer_relevancy", "context_precision", "faithfulness", "context_recall", "context_relevancy"]`
+  - Other available metrics: `"answer_similarity"`, `"answer_correctness"`
+- `--dataset`: Specifies the path to the dataset for evaluation (see the sketch after this list).
+  - The dataset format must meet the Ragas requirements.
+  - Falls back to the fiqa demo dataset if not set.
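+
+If you are assembling your own test set, the following is a minimal sketch (not part of the CLI itself) of building a QCAG dataset with the four Ragas columns and exporting it for `--dataset`. The rows are placeholders; replace them with real output from your RAG app:
+
+```python
+# Minimal QCAG sketch: one row per question, with `contexts` and
+# `ground_truths` as lists of strings, following the Ragas column layout.
+from datasets import Dataset
+
+testset = Dataset.from_dict(
+    {
+        "question": ["What is Ragas?"],
+        "answer": ["Ragas is a framework for evaluating RAG pipelines."],
+        "contexts": [["Ragas provides metrics for evaluating RAG pipelines."]],
+        "ground_truths": [["Ragas is a RAG evaluation framework."]],
+    }
+)
+testset.to_csv("path/to/dataset.csv")
+```
+
+Note that list-valued columns such as `contexts` are stored as plain strings in CSV, so for anything beyond a quick test follow the dataset guidance linked under "Prepare Dataset" below.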
+
+## Usage
+
+### Fiqa dataset demo
+
+```bash
+python3 -m ragacli --api_key "YOUR_OPENAI_API_KEY"
+```
+
+### Evaluate with GPT-4 and `BAAI/bge-small-en` embeddings
+
+Hugging Face embeddings run locally, so **make sure your machine can handle the model and has [sentence-transformers](https://pypi.org/project/sentence-transformers/) installed:**
+
+```bash
+pip install sentence-transformers
+```
+
+Then run:
+
+```bash
+python3 -m ragacli --model "gpt-4" --api_key "YOUR_OPENAI_API_KEY" --embeddings "BAAI/bge-small-en" --dataset "path/to/dataset.csv"
+```
+
+### Prepare Dataset
+
+See the [**Ragas documentation**](https://docs.ragas.io/en/stable/howtos/applications/data_preparation.html).
\ No newline at end of file
diff --git a/evaluation/ragas-sample.py b/evaluation/ragas-sample.py
deleted file mode 100644
index b774a5bbc..000000000
--- a/evaluation/ragas-sample.py
+++ /dev/null
@@ -1,88 +0,0 @@
-from ragas import evaluate
-from datasets import Dataset
-from ragas.llms import LangchainLLM
-from langchain.chat_models import ChatOpenAI
-from langchain.embeddings import OpenAIEmbeddings
-# create Langchain instance
-chat = ChatOpenAI(
-    model="a3e0c8a6-101c-4000-a1cd-d523ff7f521d",
-    openai_api_key="key",
-    openai_api_base="http://fastchat-api.172.22.96.167.nip.io/v1",
-    max_tokens=100,
-    temperature=0.5,
-)
-
-embedding = OpenAIEmbeddings(
-    model="a3e0c8a6-101c-4000-a1cd-d523ff7f521d",
-    openai_api_key="key",
-    openai_api_base="http://fastchat-api.172.22.96.167.nip.io/v1"
-)
-
-# use the Ragas LangchainLLM wrapper to create a RagasLLM instance
-llm = LangchainLLM(llm=chat)
-
-from ragas.metrics import ContextPrecision, ContextRecall, ContextRelevancy
-from ragas.metrics import AnswerCorrectness,AnswerRelevancy,AnswerSimilarity,Faithfulness
-
-# change the LLM
-
-context_precision = ContextPrecision(llm=llm)
-context_recall = ContextRecall(llm=llm)
-context_relevancy = ContextRelevancy(llm=llm)
-answer_relevancy = AnswerRelevancy(llm=llm,embeddings=embedding)
-answer_similarity = AnswerSimilarity(llm=llm,embeddings=embedding,is_cross_encoder=True)
-answer_correctness = AnswerCorrectness(llm=llm,answer_similarity=answer_similarity)
-faithfulness = Faithfulness(llm=llm)
-
-m = [context_precision,context_recall,context_relevancy,
-    answer_relevancy,answer_correctness,faithfulness,answer_similarity]
-
-q = [
-    "部门负责人在考勤管理中有哪些权利和义务?",
-    "公司对迟到打卡有哪些规定?",
-    "员工请假需要提前多久申请?"
-]
-
-c1 = [
-    "员工应严格遵守工作纪律并规范执行。",
-    "部门负责人在权限范围内有审批部门员工考勤记录的权利和严肃考勤纪律的义务,并以身作则,规范执行。",
-    "人力资源部负责考勤信息的记录、汇总和监督考勤制度的执行。"
-]
-
-c2 = [
-    "公司实行五天弹性工作制,每天工作时间不少于8小时。",
-    "每天上班给予10分钟延迟;9:40后为迟到打卡,每月最多迟到3次(不晚于10:00),超出则视为旷工;晚于10:00打卡且无正当理由,视为旷工半天。",
-    "公司考虑交通通勤情况,每天上班给予10分钟延迟。"
-]
-
-c3 = [
-    "员工请假时间小于等于2天,由直接上级、部门负责人审批,人力资源部备案;员工请假时间大于等于3天,依次由直接上级、部门负责人、公司管理层审批,人力资源部备案。",
-    "员工需要提前申请请假,具体请假时间视情况而定,一般来说,提前一天或两天向直接上级或部门负责人申请请假即可。"
-]
-
-a = [
-    "部门负责人在权限范围内有审批部门员工考勤记录的权利和严肃考勤纪律的义务,并以身作则,规范执行。",
-    "每天上班给予10分钟延迟;9:40后为迟到打卡,每月最多迟到3次(不晚于10:00),超出则视为旷工;晚于10:00打卡且无正当理由,视为旷工半天。",
-    "员工需要提前申请请假,具体请假时间视情况而定,一般来说,提前一天或两天向直接上级或部门负责人申请请假即可。"
-]
-
-c = [c1,c2,c3]
-g = [c1,c2,c3]
-
-dataset_dict = {
-    'question': q,
-    'answer': a,
-    'contexts': c,
-    'ground_truths': g
-}
-
-dataset = Dataset.from_dict(
-    {
-        "question": q,
-        "answer": a,
-        "contexts": c,
-        "ground_truths": g,
-    },
-)
-result = evaluate(dataset, metrics=m)
-print(result)
\ No newline at end of file
diff --git a/evaluation/run/run.py b/evaluation/run/run.py
new file mode 100644
index 000000000..83fe911d3
--- /dev/null
+++ b/evaluation/run/run.py
@@ -0,0 +1,48 @@
+import argparse
+import src.pkg as pkg
+from ragas import evaluate
+from datasets import load_dataset
+
+def run_evaluation():
+    parser = argparse.ArgumentParser(description='RAGAS CLI')
+    parser.add_argument("--model", type=str, default="gpt-3.5-turbo",
+                        help="Specifies the model to use for evaluation. Defaults to gpt-3.5-turbo.")
+    parser.add_argument("--api_base", type=str, default="https://api.openai.com/v1",
+                        help="Specifies the base URL for the API. Defaults to OpenAI.")
+    parser.add_argument("--api_key", type=str,
+                        help="Specifies the API key used to authenticate requests.")
+    parser.add_argument("--embeddings", type=str,
+                        help="Specifies the Hugging Face embeddings model (or its path) to use for evaluation. Will use OpenAI embeddings if not set.")
+    parser.add_argument("--metrics", type=str, nargs="*", default=[],
+                        help="Specifies the metrics to use for evaluation. Will use the Ragas default metrics if not set.")
+    parser.add_argument("--dataset", type=str,
+                        help="Specifies the path to the dataset for evaluation. Will use the fiqa dataset if not set.")
+
+    args = parser.parse_args()
+
+    model = args.model
+    api_base = args.api_base
+    api_key = args.api_key
+    metrics = args.metrics
+    dataset = args.dataset
+
+    judge_model = pkg.wrap_langchain_llm(model, api_base, api_key)
+
+    embeddings_model_name = args.embeddings
+
+    if embeddings_model_name:
+        embeddings = pkg.wrap_embeddings('huggingface', embeddings_model_name, None)
+    else:
+        embeddings = pkg.wrap_embeddings('openai', None, api_key)
+
+    if dataset:
+        # load_dataset returns a DatasetDict; CSV files land in the "train" split
+        test_set = load_dataset('csv', data_files=dataset)['train']
+    else:
+        print('test_set not provided, using fiqa dataset')
+        fiqa = load_dataset('explodinggradients/fiqa', 'ragas_eval')
+        test_set = fiqa["baseline"].select(range(5))
+
+    ms = pkg.set_metrics(metrics, judge_model, embeddings)
+
+    return evaluate(test_set, metrics=ms)
+
+
+if __name__ == "__main__":
+    print(run_evaluation())
+
diff --git a/evaluation/setup.py b/evaluation/setup.py
new file mode 100644
index 000000000..90ac07d56
--- /dev/null
+++ b/evaluation/setup.py
@@ -0,0 +1,27 @@
+# setup.py for the ragacli evaluation package
+
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
+setup(
+    name="ragacli",
+    version="0.0.1",
+    author="Kielo",
+    author_email="lanture1064@gmail.com",
+    description="A one-step CLI tool for Ragas evaluation",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.8",
+    install_requires=[
+        'ragas',
+        'langchain==0.0.354'
+    ]
+)
\ No newline at end of file
diff --git a/evaluation/src/pkg.py b/evaluation/src/pkg.py
new file mode 100644
index 000000000..cf200099b
--- /dev/null
+++ b/evaluation/src/pkg.py
@@ -0,0 +1,96 @@
+# Allows "X | None" annotations on Python 3.8+.
+from __future__ import annotations
+
+import os
+from langchain.chat_models import ChatOpenAI
+from ragas.llms import RagasLLM
+from ragas.llms import LangchainLLM
+from ragas.embeddings import RagasEmbeddings
+from ragas.embeddings import OpenAIEmbeddings
+from ragas.embeddings import HuggingfaceEmbeddings
+from ragas.metrics.base import Metric
+
+from ragas.metrics import (
+    context_precision,
+    context_recall,
+    context_relevancy,
+    answer_relevancy,
+    answer_correctness,
+    answer_similarity,
+    faithfulness
+)
+
+DEFAULT_METRICS = [
+    "answer_relevancy",
+    "context_precision",
+    "faithfulness",
+    "context_recall",
+    "context_relevancy"
+]
+
+
+def wrap_langchain_llm(
+    model: str,
+    api_base: str | None,
+    api_key: str | None
+) -> LangchainLLM:
+    """Wrap a LangChain ChatOpenAI client in the Ragas LangchainLLM adapter."""
+    if api_base is None:
+        print('api_base not provided, assuming OpenAI default')
+        api_base = 'https://api.openai.com/v1'
+        if api_key is None:
+            raise ValueError("api_key must be provided")
+        os.environ["OPENAI_API_KEY"] = api_key
+        base = ChatOpenAI(model_name=model)
+    else:
+        # OpenAI-compatible servers (e.g. vLLM, FastChat) accept a placeholder key.
+        api_key = api_key or "EMPTY"
+        os.environ["OPENAI_API_KEY"] = api_key
+        os.environ["OPENAI_API_BASE"] = api_base
+        base = ChatOpenAI(
+            model_name=model,
+            openai_api_key=api_key,
+            openai_api_base=api_base
+        )
+    return LangchainLLM(llm=base)
+
+
+def set_metrics(
+    metrics: list[str],
+    llm: RagasLLM | None,
+    embeddings: RagasEmbeddings | None
+) -> list[Metric]:
+    """Select metrics by name and attach the judge LLM and embeddings to them."""
+    ms = []
+    if llm:
+        context_precision.llm = llm
+        context_recall.llm = llm
+        context_relevancy.llm = llm
+        answer_relevancy.llm = llm
+        answer_correctness.llm = llm
+        answer_similarity.llm = llm
+        faithfulness.llm = llm
+    if embeddings:
+        answer_relevancy.embeddings = embeddings
+        answer_similarity.embeddings = embeddings
+        answer_correctness.embeddings = embeddings
+    if not metrics:
+        metrics = DEFAULT_METRICS
+    for m in metrics:
+        if m == 'context_precision':
+            ms.append(context_precision)
+        elif m == 'context_recall':
+            ms.append(context_recall)
+        elif m == 'context_relevancy':
+            ms.append(context_relevancy)
+        elif m == 'answer_relevancy':
+            ms.append(answer_relevancy)
+        elif m == 'answer_correctness':
+            ms.append(answer_correctness)
+        elif m == 'answer_similarity':
+            ms.append(answer_similarity)
+        elif m == 'faithfulness':
+            ms.append(faithfulness)
+    return ms
+
+
+def wrap_embeddings(
+    model_type: str,
+    model_name: str | None,
+    api_key: str | None
+) -> RagasEmbeddings:
+    if model_type == 'openai':
+        return OpenAIEmbeddings(api_key=api_key)
+    elif model_type == 'huggingface':
+        return HuggingfaceEmbeddings(model_name=model_name)
+    else:
+        raise ValueError(f"Invalid model type: {model_type}")
\ No newline at end of file