From 3dea55bc8fef4cd9d06b6a87a4234fc695fd5b31 Mon Sep 17 00:00:00 2001 From: Ashad Qureshi Date: Sat, 27 Jul 2024 00:23:33 +0500 Subject: [PATCH] restructuring v1 --- backend/requirements.txt | 131 ---------------------------------- backend/src/__init__.py | 0 backend/src/data_ingestion.py | 64 ----------------- backend/src/file_search.py | 68 ------------------ backend/src/get_tools.py | 27 ------- backend/src/webpage_search.py | 86 ---------------------- 6 files changed, 376 deletions(-) delete mode 100644 backend/src/__init__.py delete mode 100644 backend/src/data_ingestion.py delete mode 100644 backend/src/file_search.py delete mode 100644 backend/src/get_tools.py delete mode 100644 backend/src/webpage_search.py diff --git a/backend/requirements.txt b/backend/requirements.txt index 3b36932..e69de29 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,131 +0,0 @@ -aiofiles==23.2.1 -aiohttp==3.9.5 -aiosignal==1.3.1 -altair==5.3.0 -annotated-types==0.7.0 -anyio==4.4.0 -attrs==23.2.0 -beautifulsoup4==4.12.3 -cachetools==5.3.3 -certifi==2024.7.4 -charset-normalizer==3.3.2 -click==8.1.7 -colorama==0.4.6 -contourpy==1.2.1 -cycler==0.12.1 -dataclasses-json==0.6.7 -Deprecated==1.2.14 -dirtyjson==1.0.8 -distro==1.9.0 -dnspython==2.6.1 -email_validator==2.2.0 -fastapi==0.111.0 -fastapi-cli==0.0.4 -ffmpy==0.3.2 -filelock==3.15.4 -fonttools==4.53.0 -frozenlist==1.4.1 -fsspec==2024.6.1 -google-ai-generativelanguage==0.6.4 -google-api-core==2.19.1 -google-api-python-client==2.137.0 -google-auth==2.32.0 -google-auth-httplib2==0.2.0 -google-generativeai==0.5.4 -googleapis-common-protos==1.63.2 -greenlet==3.0.3 -grpcio==1.65.0 -grpcio-status==1.62.2 -h11==0.14.0 -httpcore==1.0.5 -httplib2==0.22.0 -httptools==0.6.1 -httpx==0.27.0 -huggingface-hub==0.23.4 -idna==3.7 -importlib_resources==6.4.0 -Jinja2==3.1.4 -joblib==1.4.2 -jsonschema==4.22.0 -jsonschema-specifications==2023.12.1 -kiwisolver==1.4.5 -llama-cloud==0.0.6 -llama-index==0.10.51 -llama-index-agent-openai==0.2.7 -llama-index-cli==0.1.12 -llama-index-core==0.10.51 -llama-index-embeddings-jinaai==0.2.0 -llama-index-embeddings-openai==0.1.10 -llama-index-indices-managed-llama-cloud==0.2.2 -llama-index-legacy==0.9.48 -llama-index-llms-gemini==0.1.11 -llama-index-llms-openai==0.1.24 -llama-index-multi-modal-llms-openai==0.1.6 -llama-index-program-openai==0.1.6 -llama-index-question-gen-openai==0.1.3 -llama-index-readers-file==0.1.25 -llama-index-readers-llama-parse==0.1.4 -llama-parse==0.4.4 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -marshmallow==3.21.3 -matplotlib==3.9.0 -mdurl==0.1.2 -multidict==6.0.5 -mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -networkx==3.3 -nltk==3.8.1 -numpy==1.26.4 -openai==1.35.7 -orjson==3.10.5 -packaging==24.1 -pandas==2.2.2 -pillow==10.3.0 -proto-plus==1.24.0 -protobuf==4.25.3 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pydantic==2.7.4 -pydantic_core==2.18.4 -pydub==0.25.1 -Pygments==2.18.0 -pyparsing==3.1.2 -pypdf==4.2.0 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -python-multipart==0.0.9 -pytz==2024.1 -PyYAML==6.0.1 -referencing==0.35.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -rpds-py==0.18.1 -rsa==4.9 -ruff==0.5.0 -semantic-version==2.10.0 -shellingham==1.5.4 -six==1.16.0 -sniffio==1.3.1 -soupsieve==2.5 -SQLAlchemy==2.0.31 -starlette==0.37.2 -striprtf==0.0.26 -tenacity==8.4.2 -tiktoken==0.7.0 -tomlkit==0.12.0 -toolz==0.12.1 -tqdm==4.66.4 -typer==0.12.3 -typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.1 -ujson==5.10.0 -uritemplate==4.1.1 -urllib3==2.2.2 -uvicorn==0.30.1 -watchfiles==0.22.0 -websockets==11.0.3 -wrapt==1.16.0 -yarl==1.9.4 diff --git a/backend/src/__init__.py b/backend/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/src/data_ingestion.py b/backend/src/data_ingestion.py deleted file mode 100644 index 44435aa..0000000 --- a/backend/src/data_ingestion.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -from dotenv import load_dotenv -from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex -from llama_index.core.vector_stores import MetadataFilters, FilterCondition -from llama_index.core.tools import FunctionTool, QueryEngineTool -from llama_index.core.node_parser import SentenceSplitter -from llama_index.embeddings.jinaai import JinaEmbedding -from llama_index.core import Settings - -from typing import List, Optional - -class QueryTools: - def __init__(self, file_path: str, name: str): - load_dotenv() - self.file_path = file_path - self.name = name - - self.embed_model = JinaEmbedding( - api_key=os.getenv("JINA_API_KEY"), - model="jina-embeddings-v2-base-en", - ) - - self.documents = SimpleDirectoryReader(input_files=[file_path]).load_data() - self.splitter = SentenceSplitter(chunk_size=1024) - - self.nodes = self.splitter.get_nodes_from_documents(self.documents) - - self.vector_index = VectorStoreIndex.from_documents(self.documents, embed_model=self.embed_model) - self.summary_index = SummaryIndex(self.nodes) - - def vector_query(self, query: str, page_numbers: Optional[List[str]] = None) -> str: - page_numbers = page_numbers or [] - metadata_dicts = [ - {"key": "page_label", "value": p} for p in page_numbers - ] - - query_engine = self.vector_index.as_query_engine( - similarity_top_k=3, - filters=MetadataFilters.from_dicts( - metadata_dicts, - condition=FilterCondition.OR - ) - ) - response = query_engine.query(query) - return response - - def get_query_tools(self): - vector_query_tool = FunctionTool.from_defaults( - name=f"vector_tool_{self.name}", - fn=self.vector_query - ) - - summary_query_engine = self.summary_index.as_query_engine( - response_mode="tree_summarize", - use_async=True, - ) - - summary_tool = QueryEngineTool.from_defaults( - name=f"summary_tool_{self.name}", - query_engine=summary_query_engine, - description=f"Useful for summarization questions related to {self.name}", - ) - - return vector_query_tool, summary_tool diff --git a/backend/src/file_search.py b/backend/src/file_search.py deleted file mode 100644 index 34cffa5..0000000 --- a/backend/src/file_search.py +++ /dev/null @@ -1,68 +0,0 @@ -from llama_index.core import VectorStoreIndex -from llama_index.core.objects import ObjectIndex -from llama_index.core.agent import FunctionCallingAgentWorker, AgentRunner -from src.get_tools import get_all_tools -from llama_index.llms.openai import OpenAI -# from llama_index.llms.gemini import Gemini -from dotenv import load_dotenv -import os - -class FileAgent: - def __init__(self, use_gemini:bool=False, gemini_model:str="models/gemini-1.0-pro"): - load_dotenv() - self.use_gemini: bool = use_gemini - self.gemini_model: str = gemini_model - - os.makedirs("./data", exist_ok=True) - os.makedirs("./data/files", exist_ok=True) - - def feed_files(self): - self.all_tools = get_all_tools(folder_path="./data/files") - - if self.use_gemini: - self.llm = self._init_gemini_llm(self.gemini_model) - else: - self.llm = OpenAI(model="gpt-3.5-turbo") #TODO: Change to "gpt-4o" for the final version - - # Initialize the object index and retriever - self.obj_index = ObjectIndex.from_objects(self.all_tools, index_cls=VectorStoreIndex) - self.obj_retriever = self.obj_index.as_retriever(similarity_top_k=3) - - # Initialize the agent worker and runner - self.agent_worker = self._create_agent_worker() - self.agent = AgentRunner(self.agent_worker) - - def _init_gemini_llm(self, model): - from llama_index.llms.gemini import Gemini - return Gemini(model=model, api_key=os.getenv("GEMINI_API_KEY")) - - def _create_agent_worker(self): - system_prompt = """ - You are an agent designed to answer queries over a set of given papers. - Please always use the tools provided to answer a question. Do not rely on prior knowledge. - """ - return FunctionCallingAgentWorker.from_tools( - tool_retriever=self.obj_retriever, - llm=self.llm, - system_prompt=system_prompt.strip(), - verbose=False - ) - - def query(self, question): - response = self.agent.query(question) - return str(response) - - def reset(self): - for file in os.listdir("./data/files"): - os.remove(f"./data/files/{file}") - # if self.agent_worker: - # self.agent_worker = self._create_agent_worker() - # if self.agent: - # self.agent = AgentRunner(self.agent_worker) - -if __name__ == "__main__": - agent = FileAgent(use_gemini=False) #? Set to True if using Gemini - agent.feed_files() - response = agent.query("What is the primary focus of his work?") - print(response) - # agent.reset() \ No newline at end of file diff --git a/backend/src/get_tools.py b/backend/src/get_tools.py deleted file mode 100644 index d135189..0000000 --- a/backend/src/get_tools.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import json -from pathlib import Path -from dotenv import load_dotenv -from src.data_ingestion import QueryTools - -def get_all_tools(folder_path: str = "./data/files"): - load_dotenv() - - papers = os.listdir(folder_path) - - try: - paper_to_tools_dict = {} - for paper in papers: - try: - print(f"Getting tools for paper: {paper}") - tools = QueryTools(f"{folder_path}/{paper}", Path(paper).stem) - print(Path(paper).stem) - vector_tool, summary_tool = tools.get_query_tools() - paper_to_tools_dict[paper] = [vector_tool, summary_tool] - except Exception as e: - raise Exception(f"Error occurred: {str(e)} in paper: {paper}") - except Exception as e: - raise Exception(f"Error occurred: {str(e)}") - - all_tools = [t for paper in papers for t in paper_to_tools_dict[paper]] - return all_tools diff --git a/backend/src/webpage_search.py b/backend/src/webpage_search.py deleted file mode 100644 index f7ad7a4..0000000 --- a/backend/src/webpage_search.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import requests -from typing import List -from dotenv import load_dotenv -from llama_index.llms.openai import OpenAI -from src.get_tools import get_all_tools -from llama_index.embeddings.jinaai import JinaEmbedding -from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex -from llama_index.core.objects import ObjectIndex -from llama_index.core.agent import FunctionCallingAgentWorker, AgentRunner - - -class WebpageSearch: - def __init__(self, urls=[]): - load_dotenv() - - os.makedirs("./data", exist_ok=True) - os.makedirs("./data/webpages", exist_ok=True) - - self.llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - self.search_results = [] - self.all_tools = get_all_tools(folder_path="./data/webpages") - - def feed_urls(self, urls: List[str]): - self.urls = urls - - def search(self): - for url in self.urls: - url = "https://r.jina.ai/" + url - response = requests.get(url) - self.search_results.append(response.text) - - def save_results(self): - for i, result in enumerate(self.search_results): - with open(f"./data/webpages/search_result_{i}.txt", "w", encoding="utf-8") as f: - f.write(result) - - def index(self): - # Initialize the object index and retriever - self.obj_index = ObjectIndex.from_objects(self.all_tools, index_cls=VectorStoreIndex) - self.obj_retriever = self.obj_index.as_retriever(similarity_top_k=3) - - # Initialize the agent worker and runner - self.agent_worker = self._create_agent_worker() - self.agent = AgentRunner(self.agent_worker) - print('sfdnfsdf') - - def _create_agent_worker(self): - system_prompt = """ - You are an agent designed to answer queries over a set of given webpages. - Please always use the tools provided to answer a question. Do not rely on prior knowledge. - """ - return FunctionCallingAgentWorker.from_tools( - tool_retriever=self.obj_retriever, - llm=self.llm, - system_prompt=system_prompt.strip(), - verbose=False - ) - - def query(self, query: str): - response = self.agent.query(query) - return str(response) - - def reset(self): - for file in os.listdir("./data/webpages"): - os.remove(f"./data/webpages/{file}") - self.search_results = [] - self.documents = [] - self.vector_index = None - self.sq = None - - def feed(self, urls: List[str]): - self.feed_urls(urls) - self.search() - self.save_results() - self.index() - - -if __name__ == "__main__": - ws = WebpageSearch() - ws.reset() - ws.feed(urls=['https://docs.llamaindex.ai/en/stable/examples/embeddings/jinaai_embeddings/']) - res = ws.query("How to implement JinaAI's embedding in python?") - print(res) - with open("search_results.txt", "w") as f: - f.write(str(res)) \ No newline at end of file