as-cle-bert committed on
Commit 6ca31d3 · verified · 1 Parent(s): 98a3d42

Upload 4 files

Files changed (4)
  1. app.py +168 -0
  2. toolsFunctions.py +85 -0
  3. usage.md +54 -0
  4. utils.py +33 -0
app.py ADDED
@@ -0,0 +1,168 @@
+ from utils import ingest_documents, qdrant_client, List, QdrantVectorStore, VectorStoreIndex, embedder
+ import gradio as gr
+ from toolsFunctions import pubmed_tool, arxiv_tool
+ from llama_index.core.tools import QueryEngineTool, FunctionTool
+ from llama_index.core import Settings
+ from llama_index.llms.mistralai import MistralAI
+ from llama_index.core.llms import ChatMessage
+ from llama_index.core.agent import ReActAgent
+ from phoenix.otel import register
+ from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
+ import time
+ import os
+
+ ## Observing and tracing
+ PHOENIX_API_KEY = os.getenv("phoenix_api_key")
+ os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
+ os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"
+ tracer_provider = register(
+     project_name="llamaindex_hf",
+ )
+ LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)
+
+ ## Global
+ Settings.embed_model = embedder
+ arxivtool = FunctionTool.from_defaults(arxiv_tool, name="arxiv_tool", description="A tool to search ArXiv (pre-print papers database) for specific papers")
+ pubmedtool = FunctionTool.from_defaults(pubmed_tool, name="pubmed_tool", description="A tool to search PubMed (printed medical papers database) for specific papers")
+ query_engine = None
+ message_history = [
+     ChatMessage(role="system", content="You are a useful assistant that has to help the user with questions that they ask about several papers they uploaded. You should base your answers on the context you can retrieve from the PDFs and, if you cannot retrieve any, search ArXiv for a potential answer. If you cannot find any viable answer, please reply that you do not know the answer to the user's question")
+ ]
+
+ ## Functions
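+ # reply() streams every answer character by character and routes the request based on the inputs:
+ # no files and no collection -> a ReAct agent with only the PubMed/ArXiv tools; files uploaded ->
+ # the PDFs are ingested into the given Qdrant collection and a RAG tool is added to the agent;
+ # an existing collection only -> that collection is loaded from Qdrant as the RAG knowledge base.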
+ def reply(message, history, files: List[str] | None, collection: str, llamaparse: bool, llamacloud_api_key: str, mistral_api_key: str):
+     global message_history
+     if mistral_api_key == "":
+         response = "You should provide a Mistral AI API key"
+         r = ""
+         for char in response:
+             r += char
+             time.sleep(0.001)
+             yield r
+     else:
+         try:
+             chat_mis = MistralAI(model="mistral-small-latest", temperature=0, api_key=mistral_api_key)
+             chat_mis.complete("Hello, who are you?")
+         except Exception as e:
+             response = "Your Mistral AI API key is not valid"
+             r = ""
+             for char in response:
+                 r += char
+                 time.sleep(0.001)
+                 yield r
+         else:
+             Settings.llm = MistralAI(model="mistral-small-latest", temperature=0, api_key=mistral_api_key)
+             if llamaparse and llamacloud_api_key == "":
+                 response = "If you activate LlamaParse, you should provide a LlamaCloud API key"
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif message == "" or message is None:
+                 response = "You should provide a message"
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif files is None and collection == "":
+                 res = "### WARNING! You did not specify any collection, so I only interrogated ArXiv and/or PubMed to answer your question\n\n"
+                 agent = ReActAgent.from_tools(tools=[pubmedtool, arxivtool], verbose=True)
+                 response = agent.chat(message=message, chat_history=message_history)
+                 response = str(response)
+                 message_history.append(ChatMessage(role="user", content=message))
+                 message_history.append(ChatMessage(role="assistant", content=response))
+                 response = res + response
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif files is None and collection != "" and collection not in [c.name for c in qdrant_client.get_collections().collections]:
+                 response = "Make sure that the name of the existing collection to use as a knowledge base is correct, because the one you provided does not exist! You can check your existing collections and their features in the dedicated tab of the app :)"
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif files is not None:
+                 if len(files) > 5:
+                     response = "You cannot upload more than 5 files"
+                     r = ""
+                     for char in response:
+                         r += char
+                         time.sleep(0.001)
+                         yield r
+                 elif collection == "":
+                     response = "You should provide a collection name (new or existing) if you want to ingest files!"
+                     r = ""
+                     for char in response:
+                         r += char
+                         time.sleep(0.001)
+                         yield r
+                 else:
+                     collection_name = collection
+                     index = ingest_documents(files, collection_name, llamaparse, llamacloud_api_key)
+                     query_engine = index.as_query_engine()
+                     rag_tool = QueryEngineTool.from_defaults(query_engine, name="papers_rag", description="A RAG engine with information from selected scientific papers")
+                     agent = ReActAgent.from_tools(tools=[rag_tool, pubmedtool, arxivtool], verbose=True)
+                     response = agent.chat(message=message, chat_history=message_history)
+                     response = str(response)
+                     message_history.append(ChatMessage(role="user", content=message))
+                     message_history.append(ChatMessage(role="assistant", content=response))
+                     r = ""
+                     for char in response:
+                         r += char
+                         time.sleep(0.001)
+                         yield r
+             else:
+                 vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection, enable_hybrid=True)
+                 index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+                 query_engine = index.as_query_engine()
+                 rag_tool = QueryEngineTool.from_defaults(query_engine, name="papers_rag", description="A RAG engine with information from selected scientific papers")
+                 agent = ReActAgent.from_tools(tools=[rag_tool, pubmedtool, arxivtool], verbose=True)
+                 response = agent.chat(message=message, chat_history=message_history)
+                 response = str(response)
+                 message_history.append(ChatMessage(role="user", content=message))
+                 message_history.append(ChatMessage(role="assistant", content=response))
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+
+ def to_markdown_color(grade: str):
+     colors = {"red": "ff0000", "yellow": "ffcc00", "green": "33cc33"}
+     mdcode = f"![#{colors[grade]}](https://placehold.co/15x15/{colors[grade]}/{colors[grade]}.png)"
+     return mdcode
+
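+ # "semantic_cache_med" and "stem_cot_qa" appear to be unrelated, pre-existing collections on the
+ # same Qdrant instance, so they are hidden from the "Available Collections" listing.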
+ def get_qdrant_collections_dets():
+     collections = [c.name for c in qdrant_client.get_collections().collections]
+     details = []
+     counter = 0
+     collections.remove("semantic_cache_med")
+     collections.remove("stem_cot_qa")
+     for collection in collections:
+         counter += 1
+         dets = qdrant_client.get_collection(collection)
+         p = f"### {counter}. {collection}\n\n**Number of Points**: {dets.points_count}\n\n**Status**: {to_markdown_color(dets.status)} {dets.status}\n\n"
+         details.append(p)
+     final_text = "<h2 align='center'>Available Collections</h2>\n\n"
+     final_text += "\n\n".join(details)
+     return final_text
+
+ ## Frontend
+ accordion = gr.Accordion(label="⚠️Set up these parameters before you start chatting!⚠️")
+
+ iface1 = gr.ChatInterface(fn=reply, additional_inputs=[gr.File(label="Upload Papers (only PDF allowed!)", file_count="multiple", file_types=[".pdf","pdf",".PDF","PDF"], value=None), gr.Textbox(label="Collection", info="Upload your papers to a collection (new or existing)", value=""), gr.Checkbox(label="Use LlamaParse", info="Needs the LlamaCloud API key", value=False), gr.Textbox(label="LlamaCloud API key", type="password", info="Set this field if you enable LlamaParse", value=""), gr.Textbox(label="Mistral AI API key", type="password", value="")], additional_inputs_accordion=accordion)
+ u = open("usage.md")
+ content = u.read()
+ u.close()
+ iface2 = gr.Blocks()
+ with iface2:
+     with gr.Row():
+         gr.Markdown(content)
+ iface3 = gr.Interface(fn=get_qdrant_collections_dets, inputs=None, outputs=gr.Markdown(label="Collections"), submit_btn="See your collections")
+ iface = gr.TabbedInterface([iface1, iface2, iface3], ["Chat💬", "Usage Guide⚙️", "Available Collections🔎"], title="PapersChat📝")
+ iface.launch(server_name="0.0.0.0", server_port=7860)
toolsFunctions.py ADDED
@@ -0,0 +1,85 @@
+ import urllib, urllib.request
+ from pydantic import Field
+ from datetime import datetime
+ from markitdown import MarkItDown
+ from Bio import Entrez
+ import xml.etree.ElementTree as ET
+
+ md = MarkItDown()
+
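+ # format_today() returns zero-padded YYYYMMDDHHMM timestamps for "now" and "two years ago",
+ # which arxiv_tool uses to restrict the ArXiv search to papers submitted in the last two years.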
+ def format_today():
+     d = datetime.now()
+     if d.month < 10:
+         month = f"0{d.month}"
+     else:
+         month = d.month
+     if d.day < 10:
+         day = f"0{d.day}"
+     else:
+         day = d.day
+     if d.hour < 10:
+         hour = f"0{d.hour}"
+     else:
+         hour = d.hour
+     if d.minute < 10:
+         minute = f"0{d.minute}"
+     else:
+         minute = d.minute
+     today = f"{d.year}{month}{day}{hour}{minute}"
+     two_years_ago = f"{d.year-2}{month}{day}{hour}{minute}"
+     return today, two_years_ago
+
+ def arxiv_tool(search_query: str = Field(description="The query with which to search ArXiv database")):
+     """A tool to search ArXiv"""
+     today, two_years_ago = format_today()
+     query = search_query.replace(" ", "+")
+     # the date filter has to be part of search_query (joined with +AND+) for ArXiv to apply it
+     url = f'http://export.arxiv.org/api/query?search_query=all:{query}+AND+submittedDate:[{two_years_ago}+TO+{today}]&start=0&max_results=3'
+     data = urllib.request.urlopen(url)
+     content = data.read().decode("utf-8")
+     f = open("arxiv_results.xml", "w")
+     f.write(content)
+     f.close()
+     result = md.convert("arxiv_results.xml")
+     return result.text_content
+
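+ # The PubMed pipeline below works in three steps: search_pubmed() retrieves up to three matching
+ # PubMed IDs via Entrez esearch, fetch_pubmed_details() downloads their MEDLINE XML records to
+ # biomed_results.xml, and fetch_xml() extracts each title and abstract into a Markdown digest.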
+ def search_pubmed(query):
+     Entrez.email = "astraberte9@gmail.com"  # Replace with your email
+     handle = Entrez.esearch(db="pubmed", term=query, retmax=3)
+     record = Entrez.read(handle)
+     handle.close()
+     return record["IdList"]
+
+ def fetch_pubmed_details(pubmed_ids):
+     Entrez.email = "your.personal@email.com"  # Replace with your email
+     handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="xml")
+     records = handle.read()
+     handle.close()
+     recs = records.decode("utf-8")
+     f = open("biomed_results.xml", "w")
+     f.write(recs)
+     f.close()
+
+ def fetch_xml():
+     tree = ET.parse("biomed_results.xml")
+     root = tree.getroot()
+     parsed_articles = []
+     for article in root.findall('PubmedArticle'):
+         # Extract title
+         title = article.find('.//ArticleTitle')
+         title_text = title.text if title is not None else "No title"
+         # Extract abstract
+         abstract = article.find('.//Abstract/AbstractText')
+         abstract_text = abstract.text if abstract is not None else "No abstract"
+         # Format output
+         formatted_entry = f"## {title_text}\n\n**Abstract**:\n\n{abstract_text}"
+         parsed_articles.append(formatted_entry)
+     return "\n\n".join(parsed_articles)
+
+ def pubmed_tool(search_query: str = Field(description="The query with which to search PubMed database")):
+     """A tool to search PubMed"""
+     idlist = search_pubmed(search_query)
+     if len(idlist) == 0:
+         return "There is no significant match in PubMed"
+     fetch_pubmed_details(idlist)
+     content = fetch_xml()
+     return content
usage.md ADDED
@@ -0,0 +1,54 @@
+ <h1 align="center">PapersChat Usage Guide</h1>
+
+ <h3 align="center">If you find PapersChat useful, please consider supporting us with a donation:</h3>
+ <div align="center">
+ <a href="https://github.com/sponsors/AstraBert"><img src="https://img.shields.io/badge/sponsor-30363D?style=for-the-badge&logo=GitHub-Sponsors&logoColor=#EA4AAA" alt="GitHub Sponsors Badge"></a>
+ </div>
+
+ > _This guide only covers how to use **the app**, not how to install or launch it or how it works internally. For that, please refer to the [GitHub repository](https://github.com/AstraBert/PapersChat)_
+
+ ### N.B.: For all the following use cases, you should provide a Mistral API key. If you don't have one, feel free to get one [here](https://console.mistral.ai/api-keys/)
+
+ ## Use PapersChat with your documents
+
+ If you have papers that you would like to chat with, this is the right section of the guide!
+
+ In order to chat with your papers, you will need to upload them (**as PDF files**) via the dedicated "Upload Papers" widget at the bottom of the chat interface: you can upload one or more files there (remember: the more you upload, the slower their processing is going to be).
+
+ > _**You can upload a maximum of 5 files**_
+
+ Once you have uploaded the files, and before submitting them, you have to do two more things:
+
+ 1. Specify the collection that you want to upload the documents to (in the "Collection" area)
+ 2. Write your first question/message to interrogate your papers (in the message input space)
+
+ As for point (1), you can give your collection whatever name you want: once you have created a new collection, you can always reuse it in the future simply by entering the same name. If you do not remember all your collections, you can go to the "Available Collections" tab in the application and click on "See your collections" to see the list of your collections.
+
+ Point (2) is very important: if you do not send any message, PapersChat will tell you that you need to send one.
+
+ Once you have uploaded the papers, specified the collection, and written the message, you can send it and PapersChat will:
+
+ - Ingest your documents
+ - Produce an answer to your question
+
+ Congrats! You now have your first collection and your first answer!
+
+ > _**NOTE**: there is one more option we haven't talked about, i.e. the 'LlamaParse' checkbox. If you select it, you will enable LlamaParse, a tool that LlamaIndex offers [as part of its LlamaCloud services](https://docs.llamaindex.ai/en/stable/llama_cloud/llama_parse/). LlamaParse employs enhanced parsing techniques to produce clean and well-structured data from (often messy) unstructured documents: the free tier lets you parse 1000 pages/day. While this approach generates very good data for your collections, keep in mind that it may take quite some time to parse your documents (especially if they are dense, contain lots of text in images, or are very long). By default the LlamaParse option is disabled. **If you enable LlamaParse, you have to provide a LlamaCloud API key!**_
+
+ ## Use PapersChat with a collection as knowledge base
+
+ Once you have uploaded all your documents, you might want to interrogate them without having to upload even more. That's where the "collection as knowledge base" option comes in handy. You can simply send a message selecting one of your existing collections as a knowledge base for PapersChat (without uploading any file) and... BAM! You will see that PapersChat replies to your questions :)
+
+ ## Use PapersChat to interrogate PubMed/ArXiv
+
+ PapersChat also has access to the PubMed and ArXiv paper archives: if you do not specify a collection name and do not upload any files, PapersChat uses your question to search these two online databases for an answer.
+
+ ## Monitor available collections
+
+ Under the "Available Collections" tab of the application you can see your collections by clicking on "See your collections": you can check how many data points are in each collection (these data points **do not correspond** to the number of papers you uploaded) and what the status of your collections is.
+
+ A brief guide to the collection statuses:
+
+ - "green": the collection is optimized and searchable
+ - "yellow": the collection is being optimized, but you can still search it
+ - "red": the collection is not optimized and will probably return an error if you try to search it
utils.py ADDED
@@ -0,0 +1,33 @@
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.core import Settings
+ from qdrant_client import QdrantClient
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+ from llama_index.core import StorageContext
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
+ from llama_cloud_services import LlamaParse
+ from typing import List
+ import os
+
+
+ qdrant_client = QdrantClient(url=os.getenv("qdrant_url"), api_key=os.getenv("qdrant_api_key"))
+ embedder = HuggingFaceEmbedding(model_name="nomic-ai/modernbert-embed-base", device="cpu")
+ Settings.embed_model = embedder
+
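+ # ingest_documents() indexes the uploaded PDFs into a hybrid (dense + sparse) Qdrant collection;
+ # when llamaparse is True, the PDFs are first parsed to Markdown via LlamaParse using the
+ # user-provided LlamaCloud API key.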
+ def ingest_documents(files: List[str], collection_name: str, llamaparse: bool, llamacloud_api_key: str):
+     vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection_name, enable_hybrid=True)
+     storage_context = StorageContext.from_defaults(vector_store=vector_store)
+     if llamaparse:
+         parser = LlamaParse(
+             result_type="markdown",
+             api_key=llamacloud_api_key
+         )
+         file_extractor = {".pdf": parser}
+         documents = SimpleDirectoryReader(input_files=files, file_extractor=file_extractor).load_data()
+     else:
+         documents = SimpleDirectoryReader(input_files=files).load_data()
+     index = VectorStoreIndex.from_documents(
+         documents,
+         storage_context=storage_context,
+     )
+     return index
+