From f73a76ce64b233ee8cfa068200a3a2ddf118c61a Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka
Date: Wed, 27 Sep 2023 00:24:48 +0200
Subject: [PATCH] prompt engineering

---
 docs/source/usage.rst                     |  22 +-
 experiments/run_experiments_anthropic.py  |  98 +++++
 experiments/run_experiments_hugginface.py |  25 +
 experiments/run_experiments_openai.py     |   6 +
 src/chemlift/icl/fewshotclassifier.py     |   9 +-
 src/chemlift/icl/fewshotpredictor.py      |  10 +-
 src/chemlift/peftmodels.py                | 425 ----------------------
 7 files changed, 164 insertions(+), 431 deletions(-)
 create mode 100644 experiments/run_experiments_anthropic.py
 create mode 100644 experiments/run_experiments_hugginface.py
 create mode 100644 experiments/run_experiments_openai.py
 delete mode 100644 src/chemlift/peftmodels.py

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 2c5006e..e535ef2 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -60,4 +60,24 @@
 from langchain.llms import OpenAI
 classifier = FewShotClassifier(LangChainChatModelWrapper(ChatAnthropic())) # or classifier = FewShotClassifier(OpenAI())
 classifier.fit(X, y)
-```
\ No newline at end of file
+classifier.predict(X)
+```
+
+Note that the logic is built such that if the number of extracted outputs is not equal to the number of query points, we return :code:`None`
+as the prediction for all query points. This is because, with the current fixed prompt setup, the outputs cannot be unambiguously assigned to the query points.
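+
+For example (a minimal sketch; :code:`X_query` stands for any array of query SMILES): if the model
+returns, say, seven comma-separated labels for ten queries, every prediction is :code:`None`:
+
+```python
+predictions = classifier.predict(X_query)
+assert predictions == [None] * len(X_query)
+```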
", + max_test=max_test, + ) + + data = get_photoswitch_data() + data = data.dropna(subset=["SMILES", "E isomer pi-pi* wavelength in nm"]) + + data["label"] = data["E isomer pi-pi* wavelength in nm"].apply( + lambda x: 1 if x > data["E isomer pi-pi* wavelength in nm"].median() else 0 + ) + + data_train, data_test = train_test_split( + data, test_size=num_test_points, stratify=data["label"], random_state=random_state + ) + + classifier.fit(data_train["SMILES"].values, data_train["label"].values) + predictions = classifier.predict(data_test["SMILES"].values) + + report = evaluate_classification(data_test["label"].values, predictions) + + report["num_support_samples"] = num_support_samples + report["strategy"] = strategy.value + report["model"] = model + report["num_test_points"] = num_test_points + report["random_state"] = random_state + + report["predictions"] = predictions + report["targets"] = data_test["label"].values + report["max_test"] = max_test + report["temperature"] = temperature + + if not os.path.exists("results"): + os.makedirs("results") + + save_pickle(f"results/{get_timestr()}_anthropic_report.pkl", report) + print(report) + + +if __name__ == "__main__": + for seed in range(5): + for num_support_samples in number_support_samples: + for strategy in strategies: + for anthropic_mode in anthropic_modes: + for num_test_points in [50]: + for temperature in [0.2, 0.8]: + for max_test in [1, 5, 10]: + try: + train_test( + num_support_samples, + strategy, + anthropic_mode, + num_test_points, + random_state=seed, + temperature=temperature, + max_test=max_test, + ) + except Exception as e: + print(e) diff --git a/experiments/run_experiments_hugginface.py b/experiments/run_experiments_hugginface.py new file mode 100644 index 0000000..bfc63c8 --- /dev/null +++ b/experiments/run_experiments_hugginface.py @@ -0,0 +1,17 @@ +# from langchain import HuggingFaceHub +# llm = HuggingFaceHub(repo_id = ) + + +models = [3 + "google/flan-t5-xl", + "bigscience/bloom", + "EleutherAI/pythia-70m-deduped", + "EleutherAI/pythia-160m-deduped", + "EleutherAI/pythia-410m-deduped", + "EleutherAI/pythia-1b-deduped", + "EleutherAI/pythia-2.8b-deduped", + "EleutherAI/pythia-6.9b-deduped", + "EleutherAI/pythia-12b-deduped", +] + + diff --git a/experiments/run_experiments_openai.py b/experiments/run_experiments_openai.py new file mode 100644 index 0000000..92522a8 --- /dev/null +++ b/experiments/run_experiments_openai.py @@ -0,0 +1,6 @@ +from gptchem.data import get_photoswitch_data +from gptchem.evaluator import evaluate_classication + +from sklearn.model_selection import train_test_split + +openai_models = ["text-ada-001", "text-davinci-003", "gpt-4", "gpt-3.5-turbo"] diff --git a/src/chemlift/icl/fewshotclassifier.py b/src/chemlift/icl/fewshotclassifier.py index 31931ca..925d6db 100644 --- a/src/chemlift/icl/fewshotclassifier.py +++ b/src/chemlift/icl/fewshotclassifier.py @@ -9,9 +9,14 @@ class FewShotClassifier(FewShotPredictor): def _extract(self, generations, expected_len): generations = sum( - [g[0].text.replace("Answer: ", "").strip().split(",") for g in generations.generations], + [ + g[0].text.split(":")[-1].replace("Answer: ", "").strip().split(",") + for generation in generations + for g in generation.generations + ], [], ) + print(generations, len(generations)) if len(generations) != expected_len: logger.warning(f"Expected {expected_len} generations, got {len(generations)}") return [None] * expected_len @@ -29,4 +34,4 @@ def _extract(self, generations, expected_len): def predict(self, X: ArrayLike, 
diff --git a/experiments/run_experiments_openai.py b/experiments/run_experiments_openai.py
new file mode 100644
index 0000000..92522a8
--- /dev/null
+++ b/experiments/run_experiments_openai.py
@@ -0,0 +1,6 @@
+from gptchem.data import get_photoswitch_data
+from gptchem.evaluator import evaluate_classification
+
+from sklearn.model_selection import train_test_split
+
+openai_models = ["text-ada-001", "text-davinci-003", "gpt-4", "gpt-3.5-turbo"]
diff --git a/src/chemlift/icl/fewshotclassifier.py b/src/chemlift/icl/fewshotclassifier.py
index 31931ca..925d6db 100644
--- a/src/chemlift/icl/fewshotclassifier.py
+++ b/src/chemlift/icl/fewshotclassifier.py
@@ -9,9 +9,14 @@ class FewShotClassifier(FewShotPredictor):
     def _extract(self, generations, expected_len):
         generations = sum(
-            [g[0].text.replace("Answer: ", "").strip().split(",") for g in generations.generations],
+            [
+                g[0].text.split(":")[-1].replace("Answer: ", "").strip().split(",")
+                for generation in generations
+                for g in generation.generations
+            ],
             [],
         )
+        logger.debug(f"Extracted {len(generations)} outputs: {generations}")
         if len(generations) != expected_len:
             logger.warning(f"Expected {expected_len} generations, got {len(generations)}")
             return [None] * expected_len
@@ -29,4 +34,4 @@
     def predict(self, X: ArrayLike, generation_kwargs: dict = {}):
         generations = self._predict(X, generation_kwargs)
-        return self._extract(generations[0], expected_len=len(X))
+        return self._extract(generations, expected_len=len(X))
diff --git a/src/chemlift/icl/fewshotpredictor.py b/src/chemlift/icl/fewshotpredictor.py
index 71e3e1d..c4b20d6 100644
--- a/src/chemlift/icl/fewshotpredictor.py
+++ b/src/chemlift/icl/fewshotpredictor.py
@@ -20,11 +20,13 @@ class FewShotPredictor:
 Examples:
 {examples}
 
+Constraint: Make sure to return exactly {number} comma-separated predictions. Each prediction should be one of {allowed_values}. Return only the predictions.
+
 Answer:
 """
 
     template_single = """{prefix}What is {property_name} of {query} given the examples below?
 
-Answer concise by only the prediction on a new line, which is one of {allowed_values}.
+Answer concisely by only returning the prediction, which should be one of {allowed_values}.
 
 Examples:
 {examples}
@@ -43,6 +45,7 @@ def __init__(
         strategy: Strategy = Strategy.RANDOM,
         seed: int = 42,
         prefix: str = "You are an expert chemist. ",
+        max_test: int = 5,
     ):
         self._support_set = None
         self._llm = llm
@@ -52,7 +55,7 @@ def __init__(
         self._property_name = property_name
         self._allowed_values = None
         self._materialclass = "molecules"
-        self._max_test = 10
+        self._max_test = max_test
         self._prefix = prefix
 
     def _format_examples(self, examples, targets):
@@ -126,6 +129,7 @@ def _predict(self, X: ArrayLike, generation_kwargs: dict = {}):
                     number=len(chunk),
                     materialclass=self._materialclass,
                     prefix=self._prefix,
+                    allowed_values=", ".join(map(str, list(self._allowed_values))),
                 )
             else:
                 examples = self._format_examples(support_examples, support_targets)
@@ -135,7 +139,7 @@ def _predict(self, X: ArrayLike, generation_kwargs: dict = {}):
                     property_name=self._property_name,
                     query=queries,
                     examples=examples,
-                    allowed_values=allowed_values,
+                    allowed_values=", ".join(map(str, list(self._allowed_values))),
                     prefix=self._prefix,
                 )
diff --git a/src/chemlift/peftmodels.py b/src/chemlift/peftmodels.py
deleted file mode 100644
index d4b2859..0000000
--- a/src/chemlift/peftmodels.py
+++ /dev/null
@@ -1,425 +0,0 @@
-from typing import List, Optional, Union
-from copy import deepcopy
-import numpy as np
-import pandas as pd
-import torch
-from gptchem.extractor import ClassificationExtractor
-from gptchem.formatter import ClassificationFormatter
-from gptchem.gpt_classifier import GPTClassifier
-from gptchem.tuner import Tuner
-from more_itertools import chunked
-from numpy.typing import ArrayLike
-from tqdm import tqdm
-
-from chemlift.finetune.peft_transformers import load_model, train_model, complete, tokenize
-from chemlift.utils import (
-    get_mode,
-    try_exccept_nan,
-    augment_smiles,
-)
-from transformers.utils import logging
-from functools import partial
-from peft.utils.save_and_load import set_peft_model_state_dict
-
-
-class ChemLIFTClassifierFactory:
-    def __init__(self, model_name: str, **kwargs):
-        self.model_name = model_name
-        self.kwargs = kwargs
-
-    def create_model(self):
-        if "openai" in self.model_name:
-            tuner = Tuner(**self.kwargs)
-            return GPTClassifier(self.model_name, tuner=tuner, **self.kwargs)
-        else:
-            return PEFTClassifier(self.model_name, **self.kwargs)
-
-    def __call__(self):
-        return self.create_model()
-
-
-class PEFTClassifier(GPTClassifier):
-    def __init__(
-        self,
-        property_name: str,
-        extractor: ClassificationExtractor = ClassificationExtractor(),
-        batch_size: int = 64,
-        tune_settings: Optional[dict] = None,
-        inference_batch_size: int = 64,
-        formatter: Optional[ClassificationFormatter] = None,
-        representation_names: Optional[List[str]] = None,
-        base_model: str = "EleutherAI/gpt-j-6b",
-        load_in_8bit: bool = True,
-        lora_kwargs: dict = {},
-        tokenizer_kwargs: dict = {},
-    ):
-        self.property_name = property_name
-        self.extractor = extractor
-        self.batch_size = batch_size
-        self.tune_settings = tune_settings or {}
-        self.inference_batch_size = inference_batch_size
-
-        self.formatter = (
-            ClassificationFormatter(
-                representation_column="repr",
-                label_column="prop",
-                property_name=property_name,
-                num_classes=None,
-            )
-            if formatter is None
-            else formatter
-        )
-        self.model, self.tokenizer = load_model(
-            base_model=base_model, load_in_8bit=load_in_8bit, lora_kwargs=lora_kwargs
-        )
-        self.representation_names = representation_names if representation_names else []
-        self.tokenizer_kwargs = tokenizer_kwargs
-        if "cutoff_len" not in self.tokenizer_kwargs:
-            self.tokenizer_kwargs["cutoff_len"] = 1024
-
-        self.tune_settings["per_device_train_batch_size"] = self.batch_size
-
-    def _prepare_df(self, X: ArrayLike, y: ArrayLike):
-        rows = []
-        for i in range(len(X)):
-            rows.append({"repr": X[i], "prop": y[i]})
-        return pd.DataFrame(rows)
-
-    def return_embeddings(
-        self,
-        X: ArrayLike,
-        layers: Optional[Union[int, List[int]]] = -1,
-        padding: bool = True,
-        truncation: bool = True,
-        insert_in_template: bool = True,
-    ):
-        """Return embeddings for a set of molecular representations.
-
-        Args:
-            X (ArrayLike): Input data (typically array of molecular representations)
-            layers (Optional[Union[int, List[int]]], optional): Layers to return embeddings from.
-                Defaults to -1.
-            padding (bool, optional): Whether to pad the input.
-                Defaults to True.
-            truncation (bool, optional): Whether to truncate the input.
-                Defaults to True.
-            insert_in_template (bool, optional): Whether to insert the input in the template.
-                Defaults to True.
-
-        Returns:
-            ArrayLike: Embeddings
-        """
-        if insert_in_template:
-            X = np.array(X)
-            if X.ndim == 1 or (X.ndim == 2 and X.size == len(X)):
-                df = self._prepare_df(X, [0] * len(X))
-                formatted = self.formatter(df)
-            elif X.ndim == 2 and X.size > len(X):
-                if not len(self.representation_names) == X.shape[1]:
-                    raise ValueError(
-                        "Number of representation names must match number of dimensions"
-                    )
-
-                dfs = []
-                for i in range(X.shape[1]):
-                    formatter = deepcopy(self.formatter)
-                    formatter.representation_name = self.representation_names[i]
-                    df = self._prepare_df(X[:, i], [0] * len(X))
-                    formatted = formatter(df)
-                    dfs.append(formatted)
-
-                formatted = pd.concat(dfs)
-            prompt_text = formatted["prompt"].to_list()
-        else:
-            prompt_text = X
-
-        embeddings = []
-
-        with torch.no_grad():
-            for chunk in tqdm(
-                chunked(range(len(prompt_text)), self.inference_batch_size),
-                total=len(prompt_text) // self.inference_batch_size,
-            ):
-                batch = [prompt_text[i] for i in chunk]
-
-                tokenize_partial = partial(
-                    tokenize,
-                    tokenizer=self.tokenizer,
-                    cutoff_len=1024,
-                    return_tensors="pt",
-                    padding=padding,
-                    truncation=truncation,
-                )
-                prompt = tokenize_partial(batch)
-                outs = self.model.forward(prompt["input_ids"], output_hidden_states=True)
-                if isinstance(layers, int):
-                    embeddings.append(outs.hidden_states[layers].cpu().numpy())
-                else:
-                    embeddings.append([outs.hidden_states[i].cpu().numpy() for i in layers])
-        # flatten the batch dim
-        embeddings = np.concatenate(embeddings, axis=0)
-
-        return embeddings
-
-    def load_state_dict(self, checkpoint_path: str):
-        """Load model from checkpoint.
-
-        Args:
-            checkpoint_path (str): Path to checkpoint
-        """
-        set_peft_model_state_dict(self.model, torch.load(checkpoint_path))
-
-    def fit(
-        self,
-        X: Optional[ArrayLike] = None,
-        y: Optional[ArrayLike] = None,
-        formatted: Optional[pd.DataFrame] = None,
-    ) -> None:
-        """Fine tune a GPT-3 model on a dataset.
-
-        Args:
-            X (ArrayLike): Input data (typically array of molecular representations)
-            y (ArrayLike): Target data (typically array of property values)
-            formatted (pd.DataFrame): Formatted data (typically output of `formatter`)
-        """
-        if formatted is None:
-            if X is None or y is None:
-                raise ValueError("Either formatted data or X and y must be provided.")
-
-        X = np.array(X)
-        y = np.array(y)
-        if formatted is None:
-            if X.ndim == 1 or (X.ndim == 2 and X.size == len(X)):
-                df = self._prepare_df(X, y)
-                formatted = self.formatter(df)
-            elif X.ndim == 2 and X.size > len(X):
-                if not len(self.representation_names) == X.shape[1]:
-                    raise ValueError(
-                        "Number of representation names must match number of dimensions"
-                    )
-
-                dfs = []
-                for i in range(X.ndim):
-                    formatter = deepcopy(self.formatter)
-                    formatter.representation_name = self.representation_names[i]
-                    df = self._prepare_df(X[:, i], y)
-                    formatted = formatter(df)
-                    dfs.append(formatted)
-
-                formatted = pd.concat(dfs)
-        train_model(
-            self.model,
-            self.tokenizer,
-            formatted[["prompt", "completion"]],
-            train_kwargs=self.tune_settings,
-            hub_model_name=None,
-            report_to=None,
-        )
-
-    def _predict(
-        self,
-        X: Optional[ArrayLike] = None,
-        temperature=0.0,
-        do_sample=False,
-        formatted: Optional[pd.DataFrame] = None,
-    ) -> ArrayLike:
-        """Predict property values for a set of molecular representations.
-
-        Args:
-            X (ArrayLike): Input data (typically array of molecular representations)
-            temperature (float, optional): Temperature for sampling. Defaults to 0.7.
-            do_sample (bool, optional): Whether to sample or not. Defaults to False.
-            formatted (pd.DataFrame, optional): Formatted data (typically output of `formatter`).
-                Defaults to None. If None, X must be provided.
-
-        Returns:
-            ArrayLike: Predicted property values
-        """
-
-        if formatted is None:
-            if X is None:
-                raise ValueError("Either formatted data or X must be provided.")
-
-        if formatted is None:
-            if X.ndim == 1 or (X.ndim == 2 and X.size == len(X)):
-                # if pandas df or series is passed, convert to numpy array
-                if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
-                    X = X.to_numpy()
-                df = self._prepare_df(X, [0] * len(X))
-                formatted = self.formatter(df)
-                dfs = [formatted]
-            elif X.ndim == 2 and X.size > len(X):
-                if not len(self.representation_names) == X.shape[1]:
-                    raise ValueError(
-                        "Number of representation names must match number of dimensions"
-                    )
-
-                dfs = []
-                for i in range(X.shape[1]):
-                    formatter = deepcopy(self.formatter)
-                    formatter.representation_name = self.representation_names[i]
-                    df = self._prepare_df(X[:, i], [0] * len(X))
-                    formatted = formatter(df)
-                    dfs.append(formatted)
-
-        else:
-            dfs = [formatted]
-
-        predictions = []
-        for df in dfs:
-            predictions.append(self._query(df, temperature=temperature, do_sample=do_sample))
-
-        return predictions
-
-    def predict(
-        self,
-        X: Optional[ArrayLike] = None,
-        temperature=0.7,
-        do_sample=False,
-        formatted: Optional[pd.DataFrame] = None,
-        return_std: bool = True,
-    ):
-        predictions = self._predict(
-            X=X, temperature=temperature, do_sample=do_sample, formatted=formatted
-        )
-
-        predictions = np.array(predictions).T
-
-        # nan values make issues here
-        predictions_mode = np.array(
-            [try_exccept_nan(get_mode, pred) for pred in predictions.astype(int)]
-        )
-
-        if return_std:
-            predictions_std = np.array([np.std(pred) for pred in predictions.astype(int)])
-            return predictions_mode, predictions_std
-        return predictions_mode
-
-    def _query(self, formatted_df, temperature, do_sample):
-        if temperature > 0 and not do_sample:
-            logger = logging.get_logger("transformers")
-            logger.warning(
-                "Temperature > 0 but do_sample is False. This will result in deterministic predictions. Set do_sample=True to sample from the distribution."
-            )
-        completions = complete(
-            self.model,
-            self.tokenizer,
-            prompt_text=formatted_df["prompt"].to_list(),
-            max_length=self.tokenizer_kwargs["cutoff_len"],
-            do_sample=do_sample,
-            temperature=temperature,
-            batch_size=self.inference_batch_size,
-        )
-
-        completions = [c["decoded"] for c in completions]
-
-        extracted = [
-            self.extractor.extract(completions[i].split("###")[1])
-            for i in range(
-                len(completions)
-            )  # ToDo: Make it possible to use other splitters than ###
-        ]
-
-        filtered = [v if v is not None else np.nan for v in extracted]
-
-        return filtered
-
-
-class SMILESAugmentedPEFTClassifier(PEFTClassifier):
-    def fit(
-        self,
-        X: Optional[ArrayLike] = None,
-        y: Optional[ArrayLike] = None,
-        augmentation_rounds: int = 10,
-        deduplicate: bool = True,
-        include_original: bool = True,
-    ) -> None:
-        """Fine tune a GPT-3 model on a dataset.
- - Args: - X (ArrayLike): Input data (typically array of molecular representations) - y (ArrayLike): Target data (typically array of property values) - augmentation_rounds (int): Number of rounds of augmentation to perform - deduplicate (bool): Whether to deduplicate the augmented data - include_original (bool): Whether to include the original data in the training set - """ - x_augmented = [] - y_augmented = [] - - if augmentation_rounds > 1: - for smiles, label in zip(X, y): - augmented = augment_smiles( - smiles, int_aug=augmentation_rounds, deduplicate=deduplicate - ) - y_augmented.extend([label] * len(augmented)) - x_augmented.extend(augmented) - else: - x_augmented = X - y_augmented = y - - if include_original: - x_augmented.extend(X) - y_augmented.extend(y) - - # shuffle - x_augmented = np.array(x_augmented) - y_augmented = np.array(y_augmented) - idx = np.random.permutation(len(x_augmented)) - x_augmented = x_augmented[idx] - y_augmented = y_augmented[idx] - - super().fit(X=x_augmented, y=y_augmented) - - def _predict( - self, - X: Optional[ArrayLike] = None, - temperature=0.7, - do_sample=False, - augmentation_rounds: int = 0, - deduplicate: bool = True, - include_original: bool = True, - ): - # we need to also keep track of canonical smiles to be able to aggregate: - compiled_predictions = [] - - for smiles in X: - if augmentation_rounds > 1: - augmented = augment_smiles( - smiles, int_aug=augmentation_rounds, deduplicate=deduplicate - ) - if include_original: - augmented.append(smiles) - else: - augmented = [smiles] - augmented = np.array(augmented) - predictions = super()._predict( - X=augmented, temperature=temperature, do_sample=do_sample - )[0] - compiled_predictions.append(predictions) - - return compiled_predictions - - def predict( - self, - X: Optional[ArrayLike] = None, - temperature=0.7, - do_sample=False, - augmentation_rounds: int = 0, - deduplicate: bool = True, - include_original: bool = True, - ): - predictions = self._predict( - X=X, - temperature=temperature, - do_sample=do_sample, - augmentation_rounds=augmentation_rounds, - deduplicate=deduplicate, - include_original=include_original, - ) - - # nan values make issues here - predictions_mode = np.array( - [try_exccept_nan(get_mode, np.array(pred).astype(int)) for pred in predictions] - ) - predictions_std = np.array([np.std(pred) for pred in predictions]) - return predictions_mode, predictions_std