{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "adcfdba2", "metadata": {}, "outputs": [], "source": [ "# import some packages\n", "import os\n", "\n", "from dotenv import load_dotenv\n", "from langchain.document_loaders import PyPDFLoader\n", "#from langchain.chat_models import ChatCohere\n", "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", "from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings\n", "from langchain.vectorstores import FAISS, Chroma\n", "from langchain.chains import ConversationalRetrievalChain\n", "from langchain.llms import HuggingFaceTextGenInference\n", "from langchain.chains.conversation.memory import (\n", " ConversationBufferMemory,\n", " ConversationBufferWindowMemory,\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "id": "2d85c6d9", "metadata": {}, "outputs": [], "source": [ "# Set api keys\n", "load_dotenv(\"API.env\") # put all the API tokens here, such as openai, huggingface...\n", "HUGGINGFACEHUB_API_TOKEN = os.getenv(\"HUGGINGFACEHUB_API_TOKEN\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "ffd3db32", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/mnt/data2/yinghanz/codes/machine_learning_projects/llm/venv/hftest/lib/python3.10/site-packages/pydantic/_internal/_fields.py:151: UserWarning: Field \"model_id\" has conflict with protected namespace \"model_\".\n", "\n", "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\n", " warnings.warn(\n", "/mnt/data2/yinghanz/codes/machine_learning_projects/llm/venv/hftest/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "# Set inference link, use this online one for easier reproduce\n", "inference_api_url = 'https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta'\n", "# Recommend using better LLMs, such as Mixtral 7x8B\n", "\n", "llm = HuggingFaceTextGenInference(\n", " verbose=True, # Provides detailed logs of operation\n", " max_new_tokens=1024, # Maximum number of token that can be generated.\n", " top_p=0.95, # Threshold for controlling randomness in text generation process. \n", " typical_p=0.95, #\n", " temperature=0.1, # For choosing probable words.\n", " inference_server_url=inference_api_url, # URL des Inferenzservers\n", " timeout=120, # Timeout for connection with the url\n", " )\n", "\n", "# Alternative, you can load model locally, e.g.:\n", "# model_path = \"where/you/store/local/models/zephyr-7b-beta\" # change this to your model path\n", "# model = AutoModelForCausalLM.from_pretrained(model_path, device_map=\"auto\")\n", "# tokenizer = AutoTokenizer.from_pretrained(model_path)\n", "# pipe = pipeline(\n", "# \"text-generation\", model=model, tokenizer=tokenizer, max_new_tokens=1024, model_kwargs={\"temperature\":0.1}\n", "# )\n", "# llm = HuggingFacePipeline(pipeline=pipe)" ] }, { "cell_type": "code", "execution_count": 4, "id": "2d5bacd5", "metadata": {}, "outputs": [], "source": [ "# Function for reading and chunking text\n", "def load_pdf_as_docs(pdf_path, loader_module=None):\n", " if pdf_path.endswith('.pdf'): # single file\n", " pdf_docs = [pdf_path]\n", " else: # a directory\n", " pdf_docs = [os.path.join(pdf_path, f) for f in os.listdir(pdf_path) if f.endswith('.pdf')]\n", " \n", " docs = []\n", " \n", " if loader_module is None: # Set PDFLoader\n", " loader_module = PyPDFLoader\n", " for pdf in pdf_docs:\n", " loader = loader_module(pdf)\n", " doc = loader.load()\n", " docs.extend(doc)\n", " \n", " return docs\n", "\n", "def get_doc_chunks(docs, splitter=None):\n", " \"\"\"Split docs into chunks.\"\"\"\n", " \n", " if splitter is None:\n", " splitter = RecursiveCharacterTextSplitter(\n", " separators=[\"\\n\\n\", \"\\n\"], chunk_size=256, chunk_overlap=128\n", " )\n", " chunks = splitter.split_documents(docs)\n", " \n", " return chunks" ] }, { "cell_type": "code", "execution_count": 5, "id": "8cd31248", "metadata": {}, "outputs": [], "source": [ "# Specify the directory containing your PDFs\n", "# directory = \"C:\\\\Orga\\\\FestBatt\\\\FB2\\\\LISA\\\\Literature\"\n", "directory = \"FestbattLiterature\" # change to your pdf dictory\n", "\n", "# Find and parse all PDFs in the directory\n", "pdf_docs = load_pdf_as_docs(directory, PyPDFLoader)\n", "\n", "document_chunks = get_doc_chunks(pdf_docs)" ] }, { "cell_type": "code", "execution_count": 6, "id": "7bf62c76", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/mnt/data2/yinghanz/codes/machine_learning_projects/llm/venv/hftest/lib/python3.10/site-packages/torch/cuda/__init__.py:141: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", " return torch._C._cuda_getDeviceCount() > 0\n" ] } ], "source": [ "# Set embedding\n", "embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5') # choose the one you like\n", "\n", "# Set vectorstore, e.g. FAISS\n", "texts = [\"LISA - Lithium Ion Solid-state Assistant\"]\n", "vectorstore = FAISS.from_texts(texts, embeddings) # this is a workaround as FAISS cannot be initilized by FAISS(embedding_function=embeddings), waiting for Langchain fix\n", "# You may also use Chroma\n", "# vectorstore = Chroma(embedding_function=embeddings)" ] }, { "cell_type": "code", "execution_count": 7, "id": "73d560de", "metadata": {}, "outputs": [], "source": [ "# Create retrievers" ] }, { "cell_type": "code", "execution_count": 12, "id": "e5796990", "metadata": {}, "outputs": [], "source": [ "# Some advanced RAG, with parent document retriever, hybrid-search and rerank\n", "\n", "# 1. ParentDocumentRetriever. Note: this will take a long time (~several minutes)\n", "\n", "from langchain.storage import InMemoryStore\n", "from langchain.retrievers import ParentDocumentRetriever\n", "# For local storage, ref: https://stackoverflow.com/questions/77385587/persist-parentdocumentretriever-of-langchain\n", "store = InMemoryStore()\n", "\n", "parent_splitter = RecursiveCharacterTextSplitter(separators=[\"\\n\\n\", \"\\n\"], chunk_size=512, chunk_overlap=256)\n", "child_splitter = RecursiveCharacterTextSplitter(separators=[\"\\n\\n\", \"\\n\"], chunk_size=256, chunk_overlap=128)\n", "\n", "parent_doc_retriver = ParentDocumentRetriever(\n", " vectorstore=vectorstore,\n", " docstore=store,\n", " child_splitter=child_splitter,\n", " parent_splitter=parent_splitter,\n", ")\n", "parent_doc_retriver.add_documents(pdf_docs)" ] }, { "cell_type": "code", "execution_count": 11, "id": "bc299740", "metadata": {}, "outputs": [], "source": [ "# 2. Hybrid search\n", "from langchain.retrievers import BM25Retriever\n", "\n", "bm25_retriever = BM25Retriever.from_documents(document_chunks, k=5) # 1/2 of dense retriever, experimental value" ] }, { "cell_type": "code", "execution_count": 13, "id": "2eb8bc8f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "config.json: 100%|██████████| 801/801 [00:00<00:00, 2.96MB/s]\n", "model.safetensors: 100%|██████████| 2.24G/2.24G [00:06<00:00, 359MB/s]\n", "tokenizer_config.json: 100%|██████████| 443/443 [00:00<00:00, 2.68MB/s]\n", "sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 405MB/s]\n", "tokenizer.json: 100%|██████████| 17.1M/17.1M [00:00<00:00, 354MB/s]\n", "special_tokens_map.json: 100%|██████████| 279/279 [00:00<00:00, 1.31MB/s]\n" ] } ], "source": [ "# 3. Rerank\n", "\"\"\"\n", "Ref:\n", "https://medium.aiplanet.com/advanced-rag-cohere-re-ranker-99acc941601c\n", "https://github.com/langchain-ai/langchain/issues/13076\n", "good to read:\n", "https://teemukanstren.com/2023/12/25/llmrag-based-question-answering/\n", "\"\"\"\n", "from __future__ import annotations\n", "from typing import Dict, Optional, Sequence\n", "from langchain.schema import Document\n", "from langchain.pydantic_v1 import Extra, root_validator\n", "\n", "from langchain.callbacks.manager import Callbacks\n", "from langchain.retrievers.document_compressors.base import BaseDocumentCompressor\n", "\n", "from sentence_transformers import CrossEncoder\n", "\n", "model_name = \"BAAI/bge-reranker-large\" #\n", "\n", "class BgeRerank(BaseDocumentCompressor):\n", " model_name:str = model_name\n", " \"\"\"Model name to use for reranking.\"\"\" \n", " top_n: int = 10 \n", " \"\"\"Number of documents to return.\"\"\"\n", " model:CrossEncoder = CrossEncoder(model_name)\n", " \"\"\"CrossEncoder instance to use for reranking.\"\"\"\n", "\n", " def bge_rerank(self,query,docs):\n", " model_inputs = [[query, doc] for doc in docs]\n", " scores = self.model.predict(model_inputs)\n", " results = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)\n", " return results[:self.top_n]\n", "\n", "\n", " class Config:\n", " \"\"\"Configuration for this pydantic object.\"\"\"\n", "\n", " extra = Extra.forbid\n", " arbitrary_types_allowed = True\n", "\n", " def compress_documents(\n", " self,\n", " documents: Sequence[Document],\n", " query: str,\n", " callbacks: Optional[Callbacks] = None,\n", " ) -> Sequence[Document]:\n", " \"\"\"\n", " Compress documents using BAAI/bge-reranker models.\n", "\n", " Args:\n", " documents: A sequence of documents to compress.\n", " query: The query to use for compressing the documents.\n", " callbacks: Callbacks to run during the compression process.\n", "\n", " Returns:\n", " A sequence of compressed documents.\n", " \"\"\"\n", " \n", " if len(documents) == 0: # to avoid empty api call\n", " return []\n", " doc_list = list(documents)\n", " _docs = [d.page_content for d in doc_list]\n", " results = self.bge_rerank(query, _docs)\n", " final_results = []\n", " for r in results:\n", " doc = doc_list[r[0]]\n", " doc.metadata[\"relevance_score\"] = r[1]\n", " final_results.append(doc)\n", " return final_results\n", " \n", " \n", "from langchain.retrievers import ContextualCompressionRetriever" ] }, { "cell_type": "code", "execution_count": 14, "id": "af780912", "metadata": {}, "outputs": [], "source": [ "# Stack all the retrievers together\n", "from langchain.retrievers import EnsembleRetriever\n", "# Ensemble all above\n", "ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, parent_doc_retriver], weights=[0.5, 0.5])\n", "\n", "# Re-rank\n", "compressor = BgeRerank()\n", "rerank_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor, base_retriever=ensemble_retriever\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "id": "beb9ab21", "metadata": {}, "outputs": [], "source": [ "## Now begin to build Q&A system\n", "class RAGChain:\n", " def __init__(\n", " self, memory_key=\"chat_history\", output_key=\"answer\", return_messages=True\n", " ):\n", " self.memory_key = memory_key\n", " self.output_key = output_key\n", " self.return_messages = return_messages\n", "\n", " def create(self, retriver, llm):\n", " memory = ConversationBufferWindowMemory( # ConversationBufferMemory(\n", " memory_key=self.memory_key,\n", " return_messages=self.return_messages,\n", " output_key=self.output_key,\n", " )\n", "\n", " # https://github.com/langchain-ai/langchain/issues/4608\n", " conversation_chain = ConversationalRetrievalChain.from_llm(\n", " llm=llm,\n", " retriever=retriver,\n", " memory=memory,\n", " return_source_documents=True,\n", " rephrase_question=False, # disable rephrase, for test purpose\n", " get_chat_history=lambda x: x,\n", " )\n", " \n", " return conversation_chain\n", " \n", " \n", "rag_chain = RAGChain()\n", "lisa_qa_conversation = rag_chain.create(rerank_retriever, llm)" ] }, { "cell_type": "code", "execution_count": 16, "id": "59159951", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/mnt/data2/yinghanz/codes/machine_learning_projects/llm/venv/hftest/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `__call__` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n", " warn_deprecated(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Two common solid electrolytes are LLZO (lithium lanthanum zirconate titanate) and sulfide-based solid electrolytes, as mentioned in the context provided.\n" ] } ], "source": [ "# Now begin to ask question\n", "question = \"Please name two common solid electrolytes.\"\n", "result = lisa_qa_conversation({\"question\":question, \"chat_history\": []})\n", "print(result[\"answer\"])" ] }, { "cell_type": "code", "execution_count": null, "id": "f5e3c7b5", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 19, "id": "d736960b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7860\n", "Running on public URL: https://3a0ee58b7378104912.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Answer: Two common solid electrolytes used in lithium metal batteries are poly(ethylene oxide)-based solid electrolytes and lithium phosphate/phosphite or lithium sulfate/sulfite layers.\n", "Source document: [Document(page_content='electrolytes (SEs) and in contrast to many liquid electrolytes, SEs are stable under high\\nelectrochemical oxidation potentials up to 5.0 V vs Li/Li+[8]. In addition, solid composite', metadata={'source': 'FestbattLiteraturemini/materials-14-03472-v2.pdf', 'page': 0, 'relevance_score': 0.35495195}), Document(page_content='chieflyforapplicationinelectricvehicles,callsforanodematerialswith\\nimproved practical specific capacity as compared to the theoretical\\ncapacityof372mAhg−1ofgraphite[1,2].Overcominglimitationsof\\norganiccarbonate-basedliquidelectrolytesduetothecomplexinter-\\nfacialchemistryandflammabilityisalsofundamentalindesigningsafer\\nLIBs[3].Inthisregard,researcheffortsaredevotedtoreplacetheli-\\nquid electrolytes with highly-conductive solid electrolytes aiming to', metadata={'source': 'FestbattLiteraturemini/1-s2.0-S1388248120301582-main.pdf', 'page': 0, 'relevance_score': 0.024606787}), Document(page_content='and power density, Li metal as a high energy density anode can be employed with solid\\nelectrolytes (SEs) and in contrast to many liquid electrolytes, SEs are stable under high', metadata={'source': 'FestbattLiteraturemini/materials-14-03472-v2.pdf', 'page': 0, 'relevance_score': 0.014535204}), Document(page_content='+depletion and concentration polarization, immobilized anions,\\nsingle-ion versus dual-ion conduction, Li+diffusion versus Li+migration, limiting current, Li dendrites\\n1. INTRODUCTION\\nSolid electrolytes are currently regarded as the most promising\\nenabler of lithium metal batteries (LMBs), which, at least\\ntheoretically can o ffer enhanced speci fic energies and energy\\ndensities compared to state-of-the-art liquid electrolyte Li-ionbatteries (LIBs).\\n1−4The poly(ethylene oxide)-based solid', metadata={'source': 'FestbattLiteraturemini/stolz-et-al-2022-single-ion-versus-dual-ion-conducting-electrolytes-the-relevance-of-concentration-polarization-in.pdf', 'page': 0, 'relevance_score': 0.013416832}), Document(page_content='J. Effective Optimization of High Voltage Solid State LithiumBatteries by Using Poly(ethylene oxide) Based Polymer Electrolytewith Semi-Interpenetrating Network. Adv. Funct. Mater. 2020 ,30,\\n2006289.\\n( 1 8 )H o m a n n ,G . ;S t o l z ,L . ;W i n t e r ,M . ;K a s n a t s c h e e w ,J .\\nElimination of “Voltage Noise ”of Poly (Ethylene Oxide)-Based\\nSolid Electrolytes in High-Voltage Lithium Batteries: Linear versusNetwork Polymers. iScience 2020 ,23, 101225.', metadata={'source': 'FestbattLiteraturemini/stolz-et-al-2022-single-ion-versus-dual-ion-conducting-electrolytes-the-relevance-of-concentration-polarization-in.pdf', 'page': 6, 'relevance_score': 0.0091508655}), Document(page_content='electrolytes, whichmayinsituformahomogeneous lithium\\nphosphate/phosphite orlithium sulfate/sulfite layerare\\npromising forthemodification ofnewelectrolytes. These\\nresultshighlight thepossibility ofsolvinginterfacial prob-', metadata={'source': 'FestbattLiteraturemini/Angew Chem Int Ed - 2022 - Zuo - Impact of the Chlorination of Lithium Argyrodites on the Electrolyte Cathode Interface in.pdf', 'page': 6, 'relevance_score': 0.0059685726}), Document(page_content='“k” is Boltzmann ’s constant. Note that the “ σ” data obtained at \\nSmall 2020, 16, 2000279\\nFigure 1. Schematic illustration of the interface between LLZO SE and \\nLFP cathode. “magnified view” shows the ILE interlayer between the \\ncathode and the solid electrolyte enabling fast ionic transport.', metadata={'source': 'FestbattLiteraturemini/Small - 2020 - Pervez - Overcoming the Interfacial Limitations Imposed by the Solid Solid Interface in Solid‐State.pdf', 'page': 1, 'relevance_score': 0.0007537542}), Document(page_content='affecttheelectrochemical decomposition behavior aswellas\\ntheinterfacial reaction between SEandNCM.Thus,new\\nelectrolytes, whichmayinsituformahomogeneous lithium\\nphosphate/phosphite orlithium sulfate/sulfite layerare', metadata={'source': 'FestbattLiteraturemini/Angew Chem Int Ed - 2022 - Zuo - Impact of the Chlorination of Lithium Argyrodites on the Electrolyte Cathode Interface in.pdf', 'page': 6, 'relevance_score': 0.0005863567}), Document(page_content='From a managerial point of view, it is key that the two efforts grow simultaneously as a combined solution to tribology’s digital transformation.\\nCoping with the challenge of defining the common terms which describe tribological specimens, equipment', metadata={'source': 'FestbattLiteraturemini/s41597-022-01429-9.pdf', 'page': 1, 'relevance_score': 7.6203854e-05})]\n", "Answer: Unfortunately, the provided context does not include information on how to synthesize gc-LPSC. You may need to consult additional resources or contact the authors of the context provided for more information.\n", "Source document: [Document(page_content='or high temperature steps which may affect the scalability of the \\nsynthesis process and increase the cost.Li-garnets are promising inorganic ceramic solid electrolytes for lithium metal', metadata={'source': 'FestbattLiteraturemini/Small - 2020 - Pervez - Overcoming the Interfacial Limitations Imposed by the Solid Solid Interface in Solid‐State.pdf', 'page': 0, 'relevance_score': 0.2680533}), Document(page_content='enabler of lithium metal batteries (LMBs), which, at least\\ntheoretically can o ffer enhanced speci fic energies and energy\\ndensities compared to state-of-the-art liquid electrolyte Li-ionbatteries (LIBs).\\n1−4The poly(ethylene oxide)-based solid', metadata={'source': 'FestbattLiteraturemini/stolz-et-al-2022-single-ion-versus-dual-ion-conducting-electrolytes-the-relevance-of-concentration-polarization-in.pdf', 'page': 0, 'relevance_score': 0.14643796}), Document(page_content='Lithium metal batteries (LMBs) promise higher energy densities\\nand speci fic energies compared to the state-of-the-art (SOTA) Li\\nion batteries (LIBs) [1–4]. However, a suitable solid electrolyte\\nor liquid electrolyte/separator system for high-performance andsafe cell (-stack) operation remains the key for application andis the predominant actual focus of research and development(R&D) [5–11].\\nThe organic -,i.e.solid polymer-based electrolytes (SPEs) are', metadata={'source': 'FestbattLiteraturemini/1-s2.0-S1369702120304521-main.pdf', 'page': 0, 'relevance_score': 0.046960726}), Document(page_content='Performance of Solid Polymer Electrolytes for Use in Solid-StateLithium Batteries. iScience 2020 ,23, 101597.\\n(8) Jung, K. N.; Shin, H. S.; Park, M. S.; Lee, J. W. Solid-State\\nLithium Batteries: Bipolar Design, Fabrication, and Electrochemistry.\\nChemElectroChem 2019 ,6, 3842−3859.\\n(9) Simonetti, E.; Carewska, M.; Di Carli, M.; Moreno, M.; De\\nFrancesco, M.; Appetecchi, G. B. Towards improvement of the\\nelectrochemical properties of ionic liquid-containing polyethylene', metadata={'source': 'FestbattLiteraturemini/stolz-et-al-2022-single-ion-versus-dual-ion-conducting-electrolytes-the-relevance-of-concentration-polarization-in.pdf', 'page': 6, 'relevance_score': 0.01367707}), Document(page_content='adjusted to ensure a balance between the number of active charge carriers (Li ions) and viscosity of the IL. The ILE was further dried at 60 °C under vacuum to decrease the water content below 5 ppm, as \\ndetected by Karl–Fischer measurements.\\nPreparation of LFP Positive Electrodes (Cathodes): Carbon-coated LFP \\nwas synthesized via a solid state method.\\n[56] Stoichiometric amounts \\nof lithium carbonate (Li 2CO 3; Aldrich, 99.95%), ammonium hydrogen', metadata={'source': 'FestbattLiteraturemini/Small - 2020 - Pervez - Overcoming the Interfacial Limitations Imposed by the Solid Solid Interface in Solid‐State.pdf', 'page': 8, 'relevance_score': 0.0011098508}), Document(page_content='avoidanyshortcircuit,astheSi-FLGdiskhasØ18mm).TheBLPEfortheionicconductivitymeasurementwaspreparedfollowingthesameprocedureusedforSi-FLG/BLPE,butnoelectrodewasusedinthiscase.\\nThe Si-FLG/BLPE was assembled in an ECC-Std cell (EL-cell,\\nGermany) with a 18 mm Li metal disk anode (200 µm thick,\\nAlbermarle)inatwo-electrodesconfiguration.TheLi||Si-FLGcellwith\\nIL_liqwasassembledusingaglasswoolWhatmanseparatordrenched\\nwith200µLofelectrolyte.Testcellsweregalvanostaticallycycled(GC)', metadata={'source': 'FestbattLiteraturemini/1-s2.0-S1388248120301582-main.pdf', 'page': 1, 'relevance_score': 0.0005449906}), Document(page_content='LiNbO 3layer was deposited on the garnet type lithium ion conductor Li 6.45Al0.05La3Zr1.6Ta0.4O12(LLZTO) to improve its\\ninterface to lithium metal and reduce dendrite formation. The application of the thin film reduced the interface resistance between', metadata={'source': 'FestbattLiteraturemini/Mann_2022_J._Electrochem._Soc._169_040564.pdf', 'page': 1, 'relevance_score': 8.970482e-05}), Document(page_content='Zenodo (CERN & OpenAIRE 2013). The listed repositories are all generic and represent only a \\nselection of the existing open-source systems (Amorim et al. 2017).\\nA second type of system in addition to the repositories, which is also increasingly used in', metadata={'source': 'FestbattLiteraturemini/kadi4mat.pdf', 'page': 1, 'relevance_score': 7.65131e-05}), Document(page_content='A second type of system in addition to the repositories, which is also increasingly used in \\nexperimentally oriented research areas, are the electronic lab notebooks (ELN) (Rubacha, Rattan', metadata={'source': 'FestbattLiteraturemini/kadi4mat.pdf', 'page': 1, 'relevance_score': 7.6393466e-05})]\n", "Answer: Yes, the paper \"Kadi4Mat: A Research Data Infrastructure for Materials Science\" by C, Schoof, E, Tosato, G, Zhao, Y, Zschumme, P, and Selzer, M, published in the Data Science Journal in 2021, provides an overview of Kadi4Mat, a research data infrastructure for materials science. It discusses the components of Kadi4Mat, including the electronic laboratory notebook (ELN), data management, and data analysis, and provides examples of how Kadi4Mat has been used in materials science research. This paper can help you gain a deeper understanding of Kadi4Mat and its potential applications in materials science research.\n", "Source document: [Document(page_content='deeper understanding of the phenomena that govern friction and wear. Missing community-wide data', metadata={'source': 'FestbattLiteraturemini/s41597-022-01429-9.pdf', 'page': 0, 'relevance_score': 0.06298193}), Document(page_content='32. Brandt, N. et al. Kadi4mat: A research data infrastructure for materials science. Data Sci. J. 20, 1–14 (2021).\\n 33. Brandt, N. et al. Managing FAIR tribological data using Kadi4Mat. Data 7, 15 (2022).\\n 34. Garabedian, N. et al . FAIR Data Package of a Tribological Showcase Pin-on-Disk Experiment. Zenodo https://doi.org/10.5281/\\nzenodo.5720626 (2021).\\n 35. Garabedian, N. et al. Generating FAIR research data in experimental tribology. Zenodo https://doi.org/10.5281/zenodo.6349293 (2022).', metadata={'source': 'FestbattLiteraturemini/s41597-022-01429-9.pdf', 'page': 10, 'relevance_score': 0.03710895}), Document(page_content='C, Schoof, E, Tosato, G, Zhao, \\nY, Zschumme, P and Selzer, M. \\n2021. Kadi4Mat: A Research \\nData Infrastructure for \\nMaterials Science. Data Science \\nJournal , 20: 8, pp. 1– 14. DOI: \\nhttps://doi.org/10.5334/dsj-\\n2021-008\\nSubmitted: 16 October 2020 \\nAccepted: 27 January 2021 \\nPublished: 10 February 2021\\nCOPYRIGHT: \\n© 2021 The Author(s). This is an \\nopen-access article distributed \\nunder the terms of the Creative \\nCommons Attribution 4.0 \\nInternational License (CC-BY', metadata={'source': 'FestbattLiteraturemini/kadi4mat.pdf', 'page': 13, 'relevance_score': 0.03163605}), Document(page_content='Brandt, N. 2020. Kadi4Mat – Karlsruhe Data Infrastructure for Materials Science . URL: https://kadi.iam-cms.\\nkit.edu (visited on Sept. 30, 2020).\\nBrandt, N, et al. Oct. 16, 2020. IAM-CMS/Kadi: Kadi4Mat. Version 0.2.0. Zenodo . DOI: https://doi.\\norg/10.5281/ZENODO.4088270\\nCantor, S and Scavo, T. 2005. Shibboleth Architecture. Protocols and Profiles, 10: 16. DOI: https://doi.\\norg/10.26869/TI.66.1\\nCARPi, N, Minges, A and Piel, M. Apr. 14, 2017. eLabFTW: An Open Source Laboratory Notebook for', metadata={'source': 'FestbattLiteraturemini/kadi4mat.pdf', 'page': 11, 'relevance_score': 0.01203158}), Document(page_content='various tools and technical infrastructures. The components can be used by web- and desktop-\\nbased applications, via uniform interfaces. Both a graphical and a programmatic interface \\nare provided, using machine-readable formats and various exchange protocols. In Figure 2 , a \\nconceptual overview of the infrastructure of Kadi4Mat is presented.\\n2.1 ELECTRONIC LAB NOTEBOOK\\nIn the ELN component, the so-called workflows are of particular importance. A workflow is a', metadata={'source': 'FestbattLiteraturemini/kadi4mat.pdf', 'page': 2, 'relevance_score': 0.004907727}), Document(page_content='plinarity of the field: many seemingly trivial tribological problems require a deep, but still holistic, understanding of processes and mechanisms that act between, at, and underneath contacting surfaces\\n12. A tribological response', metadata={'source': 'FestbattLiteraturemini/s41597-022-01429-9.pdf', 'page': 0, 'relevance_score': 7.9162426e-05}), Document(page_content='alumina crucibles. A photo of the sintered LLZO pellet is shown \\nin the inset of Figure 2a while a low magnification SEM image is shown in Figure S2, Supporting Information. The sintering \\nand pellet pressing conditions were optimized to get the pure', metadata={'source': 'FestbattLiteraturemini/Small - 2020 - Pervez - Overcoming the Interfacial Limitations Imposed by the Solid Solid Interface in Solid‐State.pdf', 'page': 1, 'relevance_score': 7.627518e-05}), Document(page_content='mitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/. © The Author(s) 2022', metadata={'source': 'FestbattLiteraturemini/s41597-022-01429-9.pdf', 'page': 10, 'relevance_score': 7.626042e-05}), Document(page_content='at 1100 °C for 3 h. To get desired dimensions, the pellets were polished with Si-carbide sand paper (grit size 400) under argon environment to obtain a thickness of ≈500 µm and a geometric area of ≈0.785 cm\\n2 for \\neach side.', metadata={'source': 'FestbattLiteraturemini/Small - 2020 - Pervez - Overcoming the Interfacial Limitations Imposed by the Solid Solid Interface in Solid‐State.pdf', 'page': 7, 'relevance_score': 7.619601e-05})]\n", "now reading document\n", "file is located at /tmp/gradio/4067b227cf47cb8a25bd94e77cfd2193637b225e/10.5445IR1000071294.pdf\n", "now creating vectordatabase\n" ] } ], "source": [ "# The rests are for Gradio GUI\n", "\n", "import gradio as gr\n", "import time\n", "from pathlib import Path\n", "\n", "# Gradio utils\n", "def add_text(history, text):\n", " \"\"\"Add conversation to history message.\"\"\"\n", " history = history + [(text, None)]\n", " yield history, \"\"\n", "\n", "\n", "def bot_lisa(history):\n", " \"\"\"Get answer from LLM.\"\"\"\n", " result = lisa_qa_conversation(\n", " {\n", " \"question\": history[-1][0], # or \"query\" if RetrievalQA\n", " \"chat_history\": history[:-1],\n", " }\n", " )\n", " print(f\"Answer: {result['answer']}\")\n", " print(f\"Source document: {result['source_documents']}\") # for debug\n", " # Citation post-processing\n", " answer_text = result[\"answer\"].strip()\n", " history[-1][1] = \"\" # Fake stream, TODO: implement streaming\n", " for character in result[\"answer\"].strip():\n", " time.sleep(0.002)\n", " history[-1][1] += character\n", " yield history, \"citation place holder\"\n", "\n", "\n", "def bot(history, qa_conversation):\n", " \"\"\"Get answer from LLM.\"\"\"\n", " # print(\"id of qa conver\", id(qa_conversation)) # for debug\n", " if qa_conversation is None:\n", " gr.Warning(\"Please upload a document first.\")\n", " \n", " result = qa_conversation(\n", " {\n", " \"question\": history[-1][0], # or \"query\" if RetrievalQA\n", " \"chat_history\": history[:-1],\n", " }\n", " )\n", " print(f\"Source document: {result['source_documents']}\") # for debug\n", " history[-1][1] = \"\" # Fake stream, TODO: implement streaming\n", " for character in result[\"answer\"].strip():\n", " time.sleep(0.002)\n", " history[-1][1] += character\n", " yield history\n", "\n", "\n", "# Ref: https://huggingface.co/spaces/fffiloni/langchain-chat-with-pdf\n", "def document_changes(doc_path):#, repo_id):\n", " if doc_path is None:\n", " gr.Warning(\"Please choose a document first and wait until uploaded.\")\n", " return \"Please choose a document and wait until uploaded.\", None # for langchain_status, qa_conversation\n", " \n", " print(\"now reading document\")\n", " print(f\"file is located at {doc_path[0]}\")\n", " \n", " file_extension = Path(doc_path[0]).suffix\n", " if file_extension == \".pdf\":\n", " pdf_docs = load_pdf_as_docs(doc_path[0])\n", " document_chunks = get_doc_chunks(pdf_docs)\n", " elif file_extension == \".xml\":\n", " raise\n", " # documents = load_xml_as_docs(doc_path[0])\n", " \n", " print(\"now creating vectordatabase\")\n", " \n", " texts = [\"LISA - Lithium Ion Solid-state Assistant\"]\n", " vectorstore = FAISS.from_texts(texts, embeddings)\n", "\n", " store = InMemoryStore()\n", "\n", " parent_splitter = RecursiveCharacterTextSplitter(separators=[\"\\n\\n\", \"\\n\"], chunk_size=512, chunk_overlap=256)\n", " child_splitter = RecursiveCharacterTextSplitter(separators=[\"\\n\\n\", \"\\n\"], chunk_size=256, chunk_overlap=128)\n", "\n", " parent_doc_retriver = ParentDocumentRetriever(\n", " vectorstore=vectorstore,\n", " docstore=store,\n", " child_splitter=child_splitter,\n", " parent_splitter=parent_splitter,\n", " )\n", " parent_doc_retriver.add_documents(pdf_docs)\n", "\n", " bm25_retriever = BM25Retriever.from_documents(document_chunks, k=5) # 1/2 of dense retriever, experimental value\n", "\n", " # Ensemble all above\n", " ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, parent_doc_retriver], weights=[0.5, 0.5])\n", "\n", " compressor = BgeRerank()\n", " rerank_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor, base_retriever=ensemble_retriever\n", " )\n", "\n", " rag_chain = RAGChain()\n", " qa_conversation = rag_chain.create(rerank_retriever, llm)\n", " \n", " print(\"now getting llm model\")\n", " \n", "\n", " file_name = Path(doc_path[0]).name # First file\n", " return f\"Ready for {file_name}\", qa_conversation\n", "\n", "\n", "# Main gradio UI\n", "def main():\n", " # Gradio interface\n", " with gr.Blocks() as demo:\n", " ######################################################################\n", " # LISA chat tab\n", "\n", " # Title info\n", " gr.Markdown(\"## LISA\")\n", " gr.Markdown(\"Q&A system with RAG.\")\n", "\n", " with gr.Tab(\"LISA\"):\n", " # Chatbot\n", " chatbot = gr.Chatbot(\n", " [],\n", " elem_id=\"chatbot\",\n", " label=\"Document Assistant (chat-history context is not supported at the moment, fixing...)\",\n", " bubble_full_width=False,\n", " show_copy_button=True,\n", " likeable=True,\n", " ) # .style(height=750)\n", " with gr.Row():\n", " with gr.Column(scale=80):\n", " user_txt = gr.Textbox(\n", " label=\"Question\",\n", " placeholder=\"Type question and press Enter\",\n", " ) # .style(container=False)\n", " with gr.Column(scale=10):\n", " submit_btn = gr.Button(\"Submit\", variant=\"primary\")\n", " with gr.Column(scale=10):\n", " clear_btn = gr.Button(\"Clear\", variant=\"stop\")\n", " # Reference (citations)\n", " with gr.Accordion(\"Advanced - Document references\", open=False):\n", " doc_citation = gr.Markdown()\n", " # alternative: https://www.gradio.app/guides/creating-a-chatbot-fast\n", " gr.Examples(\n", " examples=[\n", " \"Please name two common solid electrolytes.\",\n", " \"Please name two common oxide solid electrolytes.\",\n", " \"Please tell me what is solid-state battery.\",\n", " \"How to synthesize gc-LPSC?\",\n", " \"Please tell me the purpose of Kadi4Mat.\",\n", " \"Who is working on Kadi4Mat?\",\n", " \"Can you recommend a paper to get a deeper understanding of Kadi4Mat?\",\n", " # \"How to synthesize gc-LPSC, e.g., glass-ceramic Li5.5PS4.5Cl1.5?\",\n", " ],\n", " inputs=user_txt,\n", " outputs=chatbot,\n", " fn=add_text,\n", " # cache_examples=True,\n", " )\n", "\n", " # Manage functions\n", " user_txt.submit(add_text, [chatbot, user_txt], [chatbot, user_txt]).then(\n", " bot_lisa, chatbot, [chatbot, doc_citation]\n", " )\n", "\n", " submit_btn.click(\n", " add_text,\n", " [chatbot, user_txt],\n", " [chatbot, user_txt],\n", " # concurrency_limit=8,\n", " queue=False,\n", " ).then(bot_lisa, chatbot, [chatbot, doc_citation])\n", "\n", " clear_btn.click(lambda: None, None, chatbot, queue=False)\n", "\n", " ######################################################################\n", "\n", " ######################################################################\n", " # Document-based QA\n", "\n", " with gr.Tab(\"Document-based Q&A\"):\n", " qa_conversation = gr.State()\n", " \n", " with gr.Row():\n", " with gr.Column(scale=3, variant=\"load_file_panel\"):\n", " with gr.Row():\n", " gr.HTML(\n", " \"Upload a pdf/xml file, click the Load file button and when everything is ready, you can start asking questions about the document.\"\n", " )\n", " with gr.Row():\n", " uploaded_doc = gr.File(\n", " label=\"Upload pdf/xml file (single)\",\n", " file_count=\"multiple\", # For better looking, but only support 1 file\n", " file_types=[\".pdf\", \".xml\"],\n", " type=\"filepath\",\n", " height=100,\n", " )\n", "\n", " with gr.Row():\n", " langchain_status = gr.Textbox(\n", " label=\"Status\", placeholder=\"\", interactive=False\n", " )\n", " load_document = gr.Button(\"Load file\")\n", "\n", " with gr.Column(scale=7, variant=\"chat_panel\"):\n", " chatbot = gr.Chatbot(\n", " [],\n", " elem_id=\"chatbot\",\n", " # label=\"Document Assistant (chat-history context is not supported at the moment, fixing...)\",\n", " label=\"Document Assistant (chat-history context is not supported at the moment, fixing...)\",\n", " show_copy_button=True,\n", " likeable=True,\n", " ) # .style(height=350)\n", " docqa_question = gr.Textbox(\n", " label=\"Question\",\n", " placeholder=\"Type question and press Enter/click Submit\",\n", " )\n", " with gr.Row():\n", " with gr.Column(scale=50):\n", " docqa_submit_btn = gr.Button(\"Submit\", variant=\"primary\")\n", " with gr.Column(scale=50):\n", " docqa_clear_btn = gr.Button(\"Clear\", variant=\"stop\")\n", " \n", " gr.Examples(\n", " examples=[\n", " \"Summarize the paper\",\n", " \"Summarize the paper in 3 bullet points\",\n", " \"What are the contributions of this paper\",\n", " \"Explain the practical implications of this paper\",\n", " \"Methods used in this paper\",\n", " \"What data has been used in this paper\",\n", " \"Results of the paper\",\n", " \"Conclusions from the paper\",\n", " \"Limitations of this paper\",\n", " \"Future works suggested in this paper\",\n", " ],\n", " inputs=docqa_question,\n", " outputs=chatbot,\n", " fn=add_text,\n", " # cache_examples=True,\n", " )\n", "\n", " load_document.click(\n", " document_changes,\n", " inputs=[uploaded_doc], # , repo_id],\n", " outputs=[langchain_status, qa_conversation],#, docqa_db, docqa_retriever],\n", " queue=False,\n", " )\n", " \n", " docqa_question.submit(add_text, [chatbot, docqa_question], [chatbot, docqa_question]).then(\n", " bot, [chatbot, qa_conversation], chatbot\n", " )\n", " docqa_submit_btn.click(add_text, [chatbot, docqa_question], [chatbot, docqa_question]).then(\n", " bot, [chatbot, qa_conversation], chatbot\n", " )\n", "\n", " gr.Markdown(\"*Notes: The model may produce incorrect statements. Users should treat these outputs as suggestions or starting points, not as definitive or accurate facts.\")\n", "\n", " ######################################################################\n", "\n", " demo.queue().launch(share=True)\n", " \n", " \n", "main()" ] }, { "cell_type": "code", "execution_count": null, "id": "e2864a11", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }