From 3eb727f2a9b48cc6828f2373a2e377ab718998a5 Mon Sep 17 00:00:00 2001 From: artitw Date: Tue, 8 Oct 2024 04:24:26 +0000 Subject: [PATCH] Fix RAG Assistant --- setup.py | 3 ++- text2text/rag_assistant.py | 24 +++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index a152d75..69cc767 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="text2text", - version="1.7.2", + version="1.7.3", author="artitw", author_email="artitw@gmail.com", description="Text2Text: Crosslingual NLP/G toolkit", @@ -22,6 +22,7 @@ install_requires=[ 'faiss-cpu', 'flask', + 'beautifulsoup4', 'googledrivedownloader', 'llama-index-llms-ollama', 'ollama', diff --git a/text2text/rag_assistant.py b/text2text/rag_assistant.py index 1cc0ccf..805d141 100644 --- a/text2text/rag_assistant.py +++ b/text2text/rag_assistant.py @@ -1,9 +1,22 @@ import text2text as t2t +import requests +import warnings import urllib.parse -import urllib.request -import warnings +from bs4 import BeautifulSoup + +def get_cleaned_html(url): + r = requests.get(url) + soup = BeautifulSoup(r.text, 'html.parser') + + # Remove unwanted tags + for script in soup(['script', 'style']): + script.decompose() + + cleaned_text = soup.get_text(separator=' ', strip=True) + + return cleaned_text def is_valid_url(url): try: @@ -22,8 +35,7 @@ def __init__(self, **kwargs): for u in urls: if is_valid_url(u): try: - with urllib.request.urlopen(u) as f: - texts.append(f.read()) + texts.append(get_cleaned_html(u)) except Exception as e: warnings.warn(f"Skipping URL with errors: {u}") else: @@ -31,7 +43,9 @@ def __init__(self, **kwargs): if schema: for t in texts: - res = t2t.Assistant.chat_completion(self, [{"role": "user", "content": t}], schema=schema) + fields = ", ".join(schema.model_fields.keys()) + prompt = f'Extract {fields} from the following text:\n\n{t}' + res = t2t.Assistant.chat_completion(self, [{"role": "user", "content": prompt}], schema=schema) res = "\n".join(f'{k}: {v}' for k,v in vars(res).items()) input_lines.append(res) else: