as-cle-bert committed on
Commit 6ca31d3 · verified · 1 Parent(s): 98a3d42

Upload 4 files

Files changed (4)
  1. app.py +168 -0
  2. toolsFunctions.py +85 -0
  3. usage.md +54 -0
  4. utils.py +33 -0
app.py ADDED
@@ -0,0 +1,168 @@
+ from utils import ingest_documents, qdrant_client, List, QdrantVectorStore, VectorStoreIndex, embedder
+ import gradio as gr
+ from toolsFunctions import pubmed_tool, arxiv_tool
+ from llama_index.core.tools import QueryEngineTool, FunctionTool
+ from llama_index.core import Settings
+ from llama_index.llms.mistralai import MistralAI
+ from llama_index.core.llms import ChatMessage
+ from llama_index.core.agent import ReActAgent
+ from phoenix.otel import register
+ from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
+ import time
+ import os
+
+ ## Observing and tracing
+ PHOENIX_API_KEY = os.getenv("phoenix_api_key")
+ os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
+ os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"
+ tracer_provider = register(
+     project_name="llamaindex_hf",
+ )
+ LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)
+
+ ## Global
+ Settings.embed_model = embedder
+ arxivtool = FunctionTool.from_defaults(arxiv_tool, name="arxiv_tool", description="A tool to search ArXiv (pre-print papers database) for specific papers")
+ pubmedtool = FunctionTool.from_defaults(pubmed_tool, name="pubmed_tool", description="A tool to search PubMed (printed medical papers database) for specific papers")
+ query_engine = None
+ message_history = [
+     ChatMessage(role="system", content="You are a useful assistant that has to help the user with questions that they ask about several papers they uploaded. You should base your answers on the context you can retrieve from the PDFs and, if you cannot retrieve any, search ArXiv for a potential answer. If you cannot find any viable answer, please reply that you do not know the answer to the user's question")
+ ]
+
+ ## Functions
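+ # reply() streams every answer character by character and routes the request based on the inputs:
+ # no files and no collection -> a ReAct agent with only the PubMed/ArXiv tools; files uploaded ->
+ # the PDFs are ingested into the given Qdrant collection and a RAG tool is added to the agent;
+ # an existing collection only -> that collection is loaded from Qdrant as the RAG knowledge base.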
+ def reply(message, history, files: List[str] | None, collection: str, llamaparse: bool, llamacloud_api_key: str, mistral_api_key: str):
+     global message_history
+     if mistral_api_key == "":
+         response = "You should provide a Mistral AI API key"
+         r = ""
+         for char in response:
+             r += char
+             time.sleep(0.001)
+             yield r
+     else:
+         try:
+             chat_mis = MistralAI(model="mistral-small-latest", temperature=0, api_key=mistral_api_key)
+             chat_mis.complete("Hello, who are you?")
+         except Exception as e:
+             response = "Your Mistral AI API key is not valid"
+             r = ""
+             for char in response:
+                 r += char
+                 time.sleep(0.001)
+                 yield r
+         else:
+             Settings.llm = MistralAI(model="mistral-small-latest", temperature=0, api_key=mistral_api_key)
+             if llamaparse and llamacloud_api_key == "":
+                 response = "If you activate LlamaParse, you should provide a LlamaCloud API key"
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif message == "" or message is None:
+                 response = "You should provide a message"
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif files is None and collection == "":
+                 res = "### WARNING! You did not specify any collection, so I only interrogated ArXiv and/or PubMed to answer your question\n\n"
+                 agent = ReActAgent.from_tools(tools=[pubmedtool, arxivtool], verbose=True)
+                 response = agent.chat(message=message, chat_history=message_history)
+                 response = str(response)
+                 message_history.append(ChatMessage(role="user", content=message))
+                 message_history.append(ChatMessage(role="assistant", content=response))
+                 response = res + response
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif files is None and collection != "" and collection not in [c.name for c in qdrant_client.get_collections().collections]:
+                 response = "Make sure that the name of the existing collection to use as a knowledge base is correct, because the one you provided does not exist! You can check your existing collections and their features in the dedicated tab of the app :)"
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+             elif files is not None:
+                 if len(files) > 5:
+                     response = "You cannot upload more than 5 files"
+                     r = ""
+                     for char in response:
+                         r += char
+                         time.sleep(0.001)
+                         yield r
+                 elif collection == "":
+                     response = "You should provide a collection name (new or existing) if you want to ingest files!"
+                     r = ""
+                     for char in response:
+                         r += char
+                         time.sleep(0.001)
+                         yield r
+                 else:
+                     collection_name = collection
+                     index = ingest_documents(files, collection_name, llamaparse, llamacloud_api_key)
+                     query_engine = index.as_query_engine()
+                     rag_tool = QueryEngineTool.from_defaults(query_engine, name="papers_rag", description="A RAG engine with information from selected scientific papers")
+                     agent = ReActAgent.from_tools(tools=[rag_tool, pubmedtool, arxivtool], verbose=True)
+                     response = agent.chat(message=message, chat_history=message_history)
+                     response = str(response)
+                     message_history.append(ChatMessage(role="user", content=message))
+                     message_history.append(ChatMessage(role="assistant", content=response))
+                     r = ""
+                     for char in response:
+                         r += char
+                         time.sleep(0.001)
+                         yield r
+             else:
+                 vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection, enable_hybrid=True)
+                 index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
+                 query_engine = index.as_query_engine()
+                 rag_tool = QueryEngineTool.from_defaults(query_engine, name="papers_rag", description="A RAG engine with information from selected scientific papers")
+                 agent = ReActAgent.from_tools(tools=[rag_tool, pubmedtool, arxivtool], verbose=True)
+                 response = agent.chat(message=message, chat_history=message_history)
+                 response = str(response)
+                 message_history.append(ChatMessage(role="user", content=message))
+                 message_history.append(ChatMessage(role="assistant", content=response))
+                 r = ""
+                 for char in response:
+                     r += char
+                     time.sleep(0.001)
+                     yield r
+
+ def to_markdown_color(grade: str):
+     colors = {"red": "ff0000", "yellow": "ffcc00", "green": "33cc33"}
+     mdcode = f"![#{colors[grade]}](https://placehold.co/15x15/{colors[grade]}/{colors[grade]}.png)"
+     return mdcode
+
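+ # "semantic_cache_med" and "stem_cot_qa" appear to be unrelated, pre-existing collections on the
+ # same Qdrant instance, so they are hidden from the "Available Collections" listing.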
+ def get_qdrant_collections_dets():
+     collections = [c.name for c in qdrant_client.get_collections().collections]
+     details = []
+     counter = 0
+     collections.remove("semantic_cache_med")
+     collections.remove("stem_cot_qa")
+     for collection in collections:
+         counter += 1
+         dets = qdrant_client.get_collection(collection)
+         p = f"### {counter}. {collection}\n\n**Number of Points**: {dets.points_count}\n\n**Status**: {to_markdown_color(dets.status)} {dets.status}\n\n"
+         details.append(p)
+     final_text = "<h2 align='center'>Available Collections</h2>\n\n"
+     final_text += "\n\n".join(details)
+     return final_text
+
+ ## Frontend
+ accordion = gr.Accordion(label="⚠️Set up these parameters before you start chatting!⚠️")
+
+ iface1 = gr.ChatInterface(fn=reply, additional_inputs=[gr.File(label="Upload Papers (only PDF allowed!)", file_count="multiple", file_types=[".pdf","pdf",".PDF","PDF"], value=None), gr.Textbox(label="Collection", info="Upload your papers to a collection (new or existing)", value=""), gr.Checkbox(label="Use LlamaParse", info="Needs the LlamaCloud API key", value=False), gr.Textbox(label="LlamaCloud API key", type="password", info="Set this field if you enable LlamaParse", value=""), gr.Textbox(label="Mistral AI API key", type="password", value="")], additional_inputs_accordion=accordion)
+ u = open("usage.md")
+ content = u.read()
+ u.close()
+ iface2 = gr.Blocks()
+ with iface2:
+     with gr.Row():
+         gr.Markdown(content)
+ iface3 = gr.Interface(fn=get_qdrant_collections_dets, inputs=None, outputs=gr.Markdown(label="Collections"), submit_btn="See your collections")
+ iface = gr.TabbedInterface([iface1, iface2, iface3], ["Chat💬", "Usage Guide⚙️", "Available Collections🔎"], title="PapersChat📝")
+ iface.launch(server_name="0.0.0.0", server_port=7860)
toolsFunctions.py ADDED
@@ -0,0 +1,85 @@
+ import urllib, urllib.request
+ from pydantic import Field
+ from datetime import datetime
+ from markitdown import MarkItDown
+ from Bio import Entrez
+ import xml.etree.ElementTree as ET
+
+ md = MarkItDown()
+
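+ # format_today() returns zero-padded YYYYMMDDHHMM timestamps for "now" and "two years ago",
+ # which arxiv_tool uses to restrict the ArXiv search to papers submitted in the last two years.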
+ def format_today():
+     d = datetime.now()
+     if d.month < 10:
+         month = f"0{d.month}"
+     else:
+         month = d.month
+     if d.day < 10:
+         day = f"0{d.day}"
+     else:
+         day = d.day
+     if d.hour < 10:
+         hour = f"0{d.hour}"
+     else:
+         hour = d.hour
+     if d.minute < 10:
+         minute = f"0{d.minute}"
+     else:
+         minute = d.minute
+     today = f"{d.year}{month}{day}{hour}{minute}"
+     two_years_ago = f"{d.year-2}{month}{day}{hour}{minute}"
+     return today, two_years_ago
+
+ def arxiv_tool(search_query: str = Field(description="The query with which to search ArXiv database")):
+     """A tool to search ArXiv"""
+     today, two_years_ago = format_today()
+     query = search_query.replace(" ", "+")
+     # the date filter has to be part of search_query (joined with +AND+) for ArXiv to apply it
+     url = f'http://export.arxiv.org/api/query?search_query=all:{query}+AND+submittedDate:[{two_years_ago}+TO+{today}]&start=0&max_results=3'
+     data = urllib.request.urlopen(url)
+     content = data.read().decode("utf-8")
+     f = open("arxiv_results.xml", "w")
+     f.write(content)
+     f.close()
+     result = md.convert("arxiv_results.xml")
+     return result.text_content
+
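+ # The PubMed pipeline below works in three steps: search_pubmed() retrieves up to three matching
+ # PubMed IDs via Entrez esearch, fetch_pubmed_details() downloads their MEDLINE XML records to
+ # biomed_results.xml, and fetch_xml() extracts each title and abstract into a Markdown digest.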
+ def search_pubmed(query):
+     Entrez.email = "astraberte9@gmail.com"  # Replace with your email
+     handle = Entrez.esearch(db="pubmed", term=query, retmax=3)
+     record = Entrez.read(handle)
+     handle.close()
+     return record["IdList"]
+
+ def fetch_pubmed_details(pubmed_ids):
+     Entrez.email = "your.personal@email.com"  # Replace with your email
+     handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="xml")
+     records = handle.read()
+     handle.close()
+     recs = records.decode("utf-8")
+     f = open("biomed_results.xml", "w")
+     f.write(recs)
+     f.close()
+
+ def fetch_xml():
+     tree = ET.parse("biomed_results.xml")
+     root = tree.getroot()
+     parsed_articles = []
+     for article in root.findall('PubmedArticle'):
+         # Extract title
+         title = article.find('.//ArticleTitle')
+         title_text = title.text if title is not None else "No title"
+         # Extract abstract
+         abstract = article.find('.//Abstract/AbstractText')
+         abstract_text = abstract.text if abstract is not None else "No abstract"
+         # Format output
+         formatted_entry = f"## {title_text}\n\n**Abstract**:\n\n{abstract_text}"
+         parsed_articles.append(formatted_entry)
+     return "\n\n".join(parsed_articles)
+
+ def pubmed_tool(search_query: str = Field(description="The query with which to search PubMed database")):
+     """A tool to search PubMed"""
+     idlist = search_pubmed(search_query)
+     if len(idlist) == 0:
+         return "There is no significant match in PubMed"
+     fetch_pubmed_details(idlist)
+     content = fetch_xml()
+     return content
usage.md ADDED
@@ -0,0 +1,54 @@
+ <h1 align="center">PapersChat Usage Guide</h1>
+
+ <h3 align="center">If you find PapersChat useful, please consider supporting us with a donation:</h3>
+ <div align="center">
+ <a href="https://github.com/sponsors/AstraBert"><img src="https://img.shields.io/badge/sponsor-30363D?style=for-the-badge&logo=GitHub-Sponsors&logoColor=#EA4AAA" alt="GitHub Sponsors Badge"></a>
+ </div>
+
+ > _This guide only covers how to use **the app**, not how to install or launch it or how it works internally. For that, please refer to the [GitHub repository](https://github.com/AstraBert/PapersChat)_
+
+ ### N.B.: For all the following use cases, you should provide a Mistral API key. If you don't have one, feel free to get one [here](https://console.mistral.ai/api-keys/)
+
+ ## Use PapersChat with your documents
+
+ If you have papers that you would like to chat with, this is the right section of the guide!
+
+ In order to chat with your papers, you will need to upload them (**as PDF files**) via the dedicated "Upload Papers" widget at the bottom of the chat interface: you can upload one or more files there (remember: the more you upload, the slower their processing is going to be).
+
+ > _**You can upload a maximum of 5 files**_
+
+ Once you have uploaded the files, and before submitting them, you have to do two more things:
+
+ 1. Specify the collection that you want to upload the documents to (in the "Collection" area)
+ 2. Write your first question/message to interrogate your papers (in the message input space)
+
+ As for point (1), you can give your collection whatever name you want: once you have created a new collection, you can always reuse it in the future simply by entering the same name. If you do not remember all your collections, you can go to the "Available Collections" tab in the application and click on "See your collections" to see the list of your collections.
+
+ Point (2) is very important: if you do not send any message, PapersChat will tell you that you need to send one.
+
+ Once you have uploaded the papers, specified the collection, and written the message, you can send it and PapersChat will:
+
+ - Ingest your documents
+ - Produce an answer to your question
+
+ Congrats! You now have your first collection and your first answer!
+
+ > _**NOTE**: there is one more option we haven't talked about, i.e. the 'LlamaParse' checkbox. If you select it, you will enable LlamaParse, a tool that LlamaIndex offers [as part of its LlamaCloud services](https://docs.llamaindex.ai/en/stable/llama_cloud/llama_parse/). LlamaParse employs enhanced parsing techniques to produce clean and well-structured data from (often messy) unstructured documents: the free tier lets you parse 1000 pages/day. While this approach generates very good data for your collections, keep in mind that it may take quite some time to parse your documents (especially if they are dense, contain lots of text in images, or are very long). By default the LlamaParse option is disabled. **If you enable LlamaParse, you have to provide a LlamaCloud API key!**_
+
+ ## Use PapersChat with a collection as knowledge base
+
+ Once you have uploaded all your documents, you might want to interrogate them without having to upload even more. That's where the "collection as knowledge base" option comes in handy. You can simply send a message selecting one of your existing collections as a knowledge base for PapersChat (without uploading any file) and... BAM! You will see that PapersChat replies to your questions :)
+
+ ## Use PapersChat to interrogate PubMed/ArXiv
+
+ PapersChat also has access to the PubMed and ArXiv paper archives: if you do not specify a collection name and do not upload any files, PapersChat uses your question to search these two online databases for an answer.
+
+ ## Monitor available collections
+
+ Under the "Available Collections" tab of the application you can see your collections by clicking on "See your collections": you can check how many data points are in each collection (these data points **do not correspond** to the number of papers you uploaded) and what the status of your collections is.
+
+ A brief guide to the collection statuses:
+
+ - "green": the collection is optimized and searchable
+ - "yellow": the collection is being optimized, but you can still search it
+ - "red": the collection is not optimized and will probably return an error if you try to search it
utils.py ADDED
@@ -0,0 +1,33 @@
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.core import Settings
+ from qdrant_client import QdrantClient
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+ from llama_index.core import StorageContext
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
+ from llama_cloud_services import LlamaParse
+ from typing import List
+ import os
+
+
+ qdrant_client = QdrantClient(url=os.getenv("qdrant_url"), api_key=os.getenv("qdrant_api_key"))
+ embedder = HuggingFaceEmbedding(model_name="nomic-ai/modernbert-embed-base", device="cpu")
+ Settings.embed_model = embedder
+
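+ # ingest_documents() indexes the uploaded PDFs into a hybrid (dense + sparse) Qdrant collection;
+ # when llamaparse is True, the PDFs are first parsed to Markdown via LlamaParse using the
+ # user-provided LlamaCloud API key.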
+ def ingest_documents(files: List[str], collection_name: str, llamaparse: bool, llamacloud_api_key: str):
+     vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection_name, enable_hybrid=True)
+     storage_context = StorageContext.from_defaults(vector_store=vector_store)
+     if llamaparse:
+         parser = LlamaParse(
+             result_type="markdown",
+             api_key=llamacloud_api_key
+         )
+         file_extractor = {".pdf": parser}
+         documents = SimpleDirectoryReader(input_files=files, file_extractor=file_extractor).load_data()
+     else:
+         documents = SimpleDirectoryReader(input_files=files).load_data()
+     index = VectorStoreIndex.from_documents(
+         documents,
+         storage_context=storage_context,
+     )
+     return index
+