AssistantBench #186

Open
wants to merge 6 commits into main
39 changes: 39 additions & 0 deletions .github/workflows/unit_tests.yml
@@ -370,3 +370,42 @@ jobs:
        run: |
          pytest -n 5 --durations=10 -m 'slow and not pricy and not serial' --slowmo 1000 -v tests/visualwebarena
          pytest --durations=10 -m 'slow and not pricy and serial' --slowmo 1000 -v tests/visualwebarena

  browsergym-assistantbench:
    runs-on: ubuntu-22.04

    defaults:
      run:
        shell: bash -l {0}

    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '>=3.10'
          cache: 'pip' # caching pip dependencies

      - name: Pip install
        working-directory: ./dev
        run: pip install -r requirements.txt

      - name: Pip list
        run: pip list

      - name: Install Playwright
        run: playwright install chromium --with-deps

      - name: Run browsergym-assistantbench Unit Tests
        env:
          VWA_CLASSIFIEDS: "${{ vars.VWA_CLASSIFIEDS }}"
          VWA_CLASSIFIEDS_RESET_TOKEN: "${{ vars.VWA_CLASSIFIEDS_RESET_TOKEN }}"
          VWA_SHOPPING: "${{ vars.VWA_SHOPPING }}"
          VWA_REDDIT: "${{ vars.VWA_REDDIT }}"
          VWA_WIKIPEDIA: "${{ vars.VWA_WIKIPEDIA }}"
          VWA_HOMEPAGE: "${{ vars.VWA_HOMEPAGE }}"
          OPENAI_API_KEY: ""
        run: |
          pytest -n 5 --durations=10 -m 'not pricy' --slowmo 1000 -v tests/assistantbench
3 changes: 3 additions & 0 deletions .gitignore
@@ -140,4 +140,7 @@ tests/results
tmp.py
.vscode/settings.json

# demo and results
results/

.vscode/launch.json
2 changes: 1 addition & 1 deletion Makefile
@@ -1,6 +1,6 @@
install:
	@echo "--- 🚀 Installing project dependencies ---"
	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/
	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
	playwright install chromium --with-deps

install-demo:
11 changes: 9 additions & 2 deletions README.md
@@ -16,14 +16,21 @@ BrowserGym includes the following benchmarks by default:
- [WebArena](https://webarena.dev/)
- [VisualWebArena](https://jykoh.com/vwa)
- [WorkArena](https://github.com/ServiceNow/WorkArena)
- [AssistantBench](https://github.com/oriyor/assistantbench)

Designing new web benchmarks with BrowserGym is easy, and simply requires inheriting from the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
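For illustration only, here is a minimal sketch of such a subclass; the constructor and method signatures below (`setup`, `validate`, `teardown`) are assumptions about the interface and should be checked against the linked `task.py`.

```python
# Illustrative sketch only (not from this PR): method names and signatures are
# assumptions about the AbstractBrowserTask interface; check
# browsergym/core/src/browsergym/core/task.py for the actual contract.
import playwright.sync_api

from browsergym.core.task import AbstractBrowserTask


class ExampleTitleTask(AbstractBrowserTask):
    """Toy task: open a page and check its title."""

    def __init__(self, seed: int) -> None:
        super().__init__(seed)
        self.goal = "Open example.com and confirm the page title."

    def setup(self, page: playwright.sync_api.Page):
        # Assumed contract: navigate to the starting page and return (goal, info).
        page.goto("https://example.com")
        return self.goal, {}

    def validate(self, page: playwright.sync_api.Page, chat_messages: list):
        # Assumed contract: return (reward, done, message, info).
        success = "Example Domain" in page.title()
        return (1.0 if success else 0.0), success, "", {}

    def teardown(self) -> None:
        # Nothing to clean up in this toy example.
        pass
```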

## Setup

To install browsergym, you can either install one of the `browsergym-miniwob`, `browsergym-webarena`, `browsergym-visualwebarena` and `browsergym-workarena` packages, or you can simply install `browsergym` which includes all of these by default.
To install browsergym, you can install one of the following packages.
```sh
pip install browsergym
pip install browsergym # (recommended) everything below
pip install browsergym-core # core functionalities only (no benchmark, just the open-ended task)
pip install browsergym-miniwob # core + miniwob
pip install browsergym-webarena # core + webarena
pip install browsergym-visualwebarena # core + visualwebarena
pip install browsergym-workarena # core + workarena
pip install browsergym-assistantbench # core + assistantbench
```

Then, a required step is to set up Playwright by running `playwright install chromium --with-deps`.
21 changes: 21 additions & 0 deletions browsergym/assistantbench/README.md
@@ -0,0 +1,21 @@
# AssistantBench <> BrowserGym

This package provides an implementation for using the [AssistantBench](https://assistantbench.github.io/) benchmark in BrowserGym.

Because AssistantBench consists of open-ended tasks, setup is straightforward and only requires installing the package.

Please note that AssistantBench has a hidden test set, so test set predictions will need to be uploaded to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).

## Setting up

- Install the package (note: this is still a work in progress)
```sh
pip install browsergym-assistantbench
```

- Run inference; for example, run the following command to demo the agent on a simple toy task
```sh
python demo_agent/run_demo.py --task_name ab.imp.0
```

- Test set predictions will be saved to `./assistantbench-predictions-test.jsonl`. To evaluate on the official test set, upload these predictions to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
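As a usage sketch (not part of this package's README), a registered task can also be driven programmatically through the standard BrowserGym entry point; the `browsergym/` environment-id prefix and the `goal` observation key below are assumptions carried over from the other BrowserGym benchmarks.

```python
# Usage sketch; the "browsergym/" env-id prefix and the "goal" observation key
# are assumptions based on the other BrowserGym benchmarks, not on this PR.
import gymnasium as gym

import browsergym.assistantbench  # noqa: F401  (importing registers the ab.* tasks)

env = gym.make("browsergym/ab.imp.0")
obs, info = env.reset()
print(obs["goal"])  # the open-ended AssistantBench question
env.close()
```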
35 changes: 35 additions & 0 deletions browsergym/assistantbench/pyproject.toml
@@ -0,0 +1,35 @@
[build-system]
requires = ["hatchling", "hatch-requirements-txt"]
build-backend = "hatchling.build"

[project]
name = "browsergym-assistantbench"
description = "AssistantBench benchmark for BrowserGym"
authors = [
{name = "Ori Yoran"},
{name = "Maxime Gasse"},
]
readme = "README.md"
requires-python = ">3.7"
license = {text = "Apache-2.0"}
classifiers = [
"Development Status :: 2 - Pre-Alpha",
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"License :: OSI Approved :: Apache Software License",
]
dynamic = ["dependencies", "version"]

[project.urls]
homepage = "https://github.com/ServiceNow/BrowserGym"

[tool.hatch.version]
path = "../core/src/browsergym/core/__init__.py"

[tool.hatch.metadata.hooks.requirements_txt]
files = ["requirements.txt"]

[tool.hatch.build.targets.wheel]
packages = ["src/browsergym"]
4 changes: 4 additions & 0 deletions browsergym/assistantbench/requirements.txt
@@ -0,0 +1,4 @@
browsergym-core==0.8.1
datasets
scipy
numpy
@@ -0,0 +1,47 @@
from browsergym.core.registration import register_task

from . import task

ALL_AB_TASK_IDS = []

ALL_TOY_AB_TASK_IDS = []
ALL_DEV_AB_TASK_IDS = []
ALL_TEST_AB_TASK_IDS = []

# register an easy toy task for testing the implementation
gym_id = "ab.imp.0"
register_task(
    gym_id,
    task.AssistantBenchTask,
    task_kwargs={
        "task_id": "imp.0",
        "output_file_path": "./assistantbench-predictions-imp.jsonl",
    },
)
ALL_AB_TASK_IDS.append(gym_id)
ALL_TOY_AB_TASK_IDS.append(gym_id)

# register the AssistantBench dev set
for task_id in range(33):
    gym_id = f"ab.{task_id}"
    register_task(
        gym_id,
        task.AssistantBenchTask,
        task_kwargs={"task_id": f"{task_id}"},
    )
    ALL_AB_TASK_IDS.append(gym_id)
    ALL_DEV_AB_TASK_IDS.append(gym_id)

# register the AssistantBench test set
for task_id in range(181):
    gym_id = f"ab.test.{task_id}"
    register_task(
        gym_id,
        task.AssistantBenchTask,
        task_kwargs={
            "task_id": f"test.{task_id}",
            "output_file_path": "./assistantbench-predictions-test.jsonl",
        },
    )
    ALL_AB_TASK_IDS.append(gym_id)
    ALL_TEST_AB_TASK_IDS.append(gym_id)
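As a hypothetical downstream use of these lists (not part of the diff), a script could sweep an entire split; the import below assumes this module is the package's `__init__.py`.

```python
# Hypothetical consumer of the id lists defined above; assumes the module shown
# here is the browsergym.assistantbench package __init__.
from browsergym.assistantbench import ALL_DEV_AB_TASK_IDS, ALL_TEST_AB_TASK_IDS

print(len(ALL_DEV_AB_TASK_IDS))   # 33 dev tasks: "ab.0" ... "ab.32"
print(len(ALL_TEST_AB_TASK_IDS))  # 181 test tasks: "ab.test.0" ... "ab.test.180"
```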
@@ -0,0 +1,68 @@
from typing import Dict, List

import numpy as np

from .utils import _align_bags


def calculate_f1_score(precision, recall):
    if precision + recall == 0:
        return 0  # Handle the case to avoid division by zero
    return 2 * (precision * recall) / (precision + recall)


def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool):
    from .evaluate_factory import get_evaluator_from_gold_answer

    recall = []
    for gold_key, gold_value in gold.items():
        pred_value = pred.get(gold_key)
        gold_value = fix_number(gold_value)
        pred_value = fix_number(pred_value)
        if gold_key not in pred:
            recall.append(0)
        else:
            evaluator = (
                get_evaluator_from_gold_answer(type(gold_value))
                if use_gold_for_eval
                else get_evaluator_from_gold_answer(type(pred_value))
            )
            if type(pred_value) != type(gold_value):
                recall.append(0)
                continue
            recall.append(evaluator(pred_value, gold_value))
    avg_recall = np.average(recall)
    return avg_recall


def fix_number(number):
    if type(number) == str:
        # Strip currency/unit markers, then try to parse as a float.
        copy_ans = number
        copy_ans = " ".join(
            " ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")
        ).strip()
        copy_ans = copy_ans.strip()
        copy_ans = copy_ans.replace(",", ".")
        try:
            return float(copy_ans)
        except ValueError:
            return number
    elif type(number) == int:
        return float(number)
    else:
        return number


def evaluate_pair_of_dicts(pred: Dict, gold: Dict):
    recall = calc_recall(pred, gold, True)
    precision = calc_recall(gold, pred, False)
    f1 = calculate_f1_score(precision, recall)
    return f1


def evaluate_dicts(pred: List[Dict], gold: List[Dict]):
    if not (type(pred) == dict or len(pred) == 0 or (type(pred) == list and type(pred[0]) == dict)):
        return 0
    max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts)
    return np.average(max_alignment_scores)
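To make the scoring concrete, here is a small worked example that uses only the helpers defined in this file (`fix_number`, `calculate_f1_score`); the printed values follow directly from the code as written.

```python
# Worked examples for the helpers above; values follow from the code as written.
print(fix_number("$250"))          # 250.0 (currency symbol stripped, cast to float)
print(fix_number("3,5"))           # 3.5 (comma treated as a decimal separator)
print(fix_number("not a number"))  # returned unchanged when float() fails

print(calculate_f1_score(0.5, 1.0))  # 0.666... = 2 * (0.5 * 1.0) / (0.5 + 1.0)
print(calculate_f1_score(0.0, 0.0))  # 0 (guarded case, avoids division by zero)
```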
@@ -0,0 +1,28 @@
from typing import Union

from .evaluate_dicts import evaluate_dicts
from .evaluate_numbers import evaluate_numbers
from .evaluate_strings import evaluate_strings

EvaluatorFactory = {
    "string": evaluate_strings,
    "number": evaluate_numbers,
    "json": evaluate_dicts,
    "string list": evaluate_strings,
}

EvaluatorFactoryFromType = {
    str: evaluate_strings,
    int: evaluate_numbers,
    float: evaluate_numbers,
    bool: evaluate_strings,
    list: evaluate_strings,
}


def get_evaluator(evaluator: str):
    return EvaluatorFactory[evaluator]


def get_evaluator_from_gold_answer(gold_answer: Union[str, int, float]):
    return EvaluatorFactoryFromType[gold_answer]
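As a brief usage sketch (not part of the diff), the gold answer's Python type selects the metric, mirroring how `calc_recall` calls the factory with `type(gold_value)`; the flat import paths below are an assumption made so the snippet stands alone.

```python
# Dispatch sketch; flat module names are assumed here for a standalone example.
from evaluate_factory import get_evaluator, get_evaluator_from_gold_answer
from evaluate_numbers import evaluate_numbers
from evaluate_strings import evaluate_strings

assert get_evaluator("number") is evaluate_numbers
assert get_evaluator_from_gold_answer(float) is evaluate_numbers  # float gold: number metric
assert get_evaluator_from_gold_answer(str) is evaluate_strings    # str gold: string metric
```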
@@ -0,0 +1,34 @@
from typing import Union

import numpy as np


# Renamed calc_z function to distance_function_log
def distance_function_log(pred: float, gold: float):
    if pred == gold == 0:
        return 1
    if pred == 0:
        pred = 1e-4
    if gold == 0:
        gold = 1e-4
    if pred > gold:
        return max(0, 1 - np.log(pred / gold))
    else:
        return max(0, 1 - np.log(gold / pred))


def evaluate_numbers(pred: Union[float, str], gold: float):
    res = None
    if type(pred) != float and type(pred) != int:
        try:
            pred = float(pred)
        except ValueError:
            res = 0
    if type(gold) != float and type(gold) != int:
        try:
            gold = float(gold)
        except ValueError:
            res = 0
    if res is None:
        res = distance_function_log(pred, gold)
    return res
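To illustrate the number metric, a few worked values; each score is `max(0, 1 - ln(larger / smaller))` as defined above.

```python
# Worked examples of the log-distance metric defined above.
print(evaluate_numbers("100", 100.0))  # 1.0 (exact match after parsing the string)
print(evaluate_numbers(120, 100.0))    # ~0.82, i.e. 1 - ln(120/100)
print(evaluate_numbers(300, 100.0))    # 0 (clamped once 1 - ln(ratio) goes negative)
print(evaluate_numbers("n/a", 5.0))    # 0 (unparsable prediction)
```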