# from langchain import HuggingFaceHub, LLMChain
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    T5Tokenizer,
    T5ForConditionalGeneration,
    GPT2TokenizerFast,
    LlamaForCausalLM,
    LlamaTokenizer,
)
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    PromptTemplate,
)

# model_path = "/mnt/localstorage/yinghan/llm/orca_mini_v3_13b"
# model = LlamaForCausalLM.from_pretrained(model_path, device_map="auto")  # , load_in_8bit=True)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

from langchain.chat_models import ChatOpenAI
# from langchain_openai import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.document_loaders import TextLoader, UnstructuredHTMLLoader, PyPDFLoader
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.llms import HuggingFaceHub
from dotenv import load_dotenv
from langchain.llms import HuggingFaceTextGenInference
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.conversation.memory import (
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
)


def get_llm_hf_online(inference_api_url=""):
    """Get an LLM served via the Hugging Face inference API."""
    if not inference_api_url:
        # Default API URL.
        inference_api_url = (
            "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
        )

    llm = HuggingFaceTextGenInference(
        # cache=None,  # optional: whether to use a cache
        verbose=True,  # provides detailed logs of the operation
        # callbacks=[StreamingStdOutCallbackHandler()],  # handles streaming output
        max_new_tokens=1024,  # maximum number of tokens that can be generated
        # top_k=2,  # number of top-k tokens considered during generation
        top_p=0.95,  # threshold for controlling randomness in the generation process
        typical_p=0.95,
        # temperature=0.1,  # lower values favor more probable words
        # repetition_penalty=None,  # repetition penalty during generation
        # truncate=None,  # truncate input tokens to the given size
        # stop_sequences=None,  # list of stop sequences for generation
        inference_server_url=inference_api_url,  # URL of the inference server
        timeout=10,  # timeout for the connection to the server
        # streaming=True,  # stream the answer token by token
    )

    return llm
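
# A minimal usage sketch (commented out, not executed on import). It assumes the hosted
# Inference API is reachable and that any required Hugging Face access token has already
# been configured (load_dotenv() is imported above for that kind of environment setup);
# the prompt string is only an illustrative placeholder.
#
#   load_dotenv()
#   llm = get_llm_hf_online()
#   print(llm("What is retrieval-augmented generation?"))
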

def get_llm_hf_local(model_path):
    """Get a local LLM wrapped in a Hugging Face transformers pipeline."""
    # model_path = "/mnt/localstorage/yinghan/llm/orca_mini_v3_13b"
    # model_path = "/mnt/localstorage/yinghan/llm/zephyr-7b-beta"
    # AutoModelForCausalLM would also work here and is more general: it resolves the
    # architecture from the model config, while LlamaForCausalLM only loads Llama-family models.
    model = LlamaForCausalLM.from_pretrained(
        model_path, device_map="auto"
    )  # , load_in_8bit=True)
    # model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")  # , load_in_8bit=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Note: max_length is deprecated in favor of max_new_tokens.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,  # may need tuning
        model_kwargs={"temperature": 0.1},  # may need tuning
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    return llm


def get_llm_openai_chat(model_name, inference_server_url, langfuse_callback=None):
    """Get an OpenAI-compatible chat LLM."""
    # Some defaults:
    # chat_model_name = "openchat/openchat_3.5"
    # inference_server_url = "http://localhost:8080/v1"
    llm = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",  # placeholder; local OpenAI-compatible servers ignore the key
        openai_api_base=inference_server_url,
        max_tokens=1024,  # may need tuning
        temperature=0,  # ChatOpenAI default is 0.7
        # callbacks=[langfuse_callback],
    )
    # The following is not required for building a normal LLM:
    # use the Ragas LangchainLLM wrapper to create a RagasLLM instance
    # vllm = LangchainLLM(llm=chat)
    # return vllm
    return llm


def get_chat_vllm(model_name, inference_server_url, langfuse_callback=None):
    """Create a vLLM-backed Langchain chat model via the OpenAI-compatible API."""  # TODO: to fix
    # Some defaults:
    # chat_model_name = "openchat/openchat_3.5"
    # inference_server_url = "http://localhost:8080/v1"
    chat = ChatOpenAI(
        model=model_name,
        openai_api_key="EMPTY",
        openai_api_base=inference_server_url,
        max_tokens=512,  # may need tuning
        temperature=0.1,  # ChatOpenAI default is 0.7
        # callbacks=[langfuse_callback],
    )
    # The following is not required for building a normal LLM:
    # use the Ragas LangchainLLM wrapper to create a RagasLLM instance
    # vllm = LangchainLLM(llm=chat)
    # return vllm
    return chat
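

# Minimal end-to-end sketch tying the helpers above to a RetrievalQA chain. Everything
# here is an assumption chosen for illustration: the document path "docs/sample.txt", the
# embedding model "sentence-transformers/all-MiniLM-L6-v2" (requires the
# sentence-transformers package), and an OpenAI-compatible server at
# http://localhost:8080/v1 serving "openchat/openchat_3.5" (e.g. via vLLM).
# Adjust paths, model names, and URLs to your own setup.
if __name__ == "__main__":
    load_dotenv()

    # Build a small vector store from a single text file.
    docs = TextLoader("docs/sample.txt").load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    vectordb = Chroma.from_documents(chunks, embeddings)

    # Any of the constructors above works here; the OpenAI-compatible one is shown.
    llm = get_llm_openai_chat("openchat/openchat_3.5", "http://localhost:8080/v1")

    # Wire the retriever and the LLM into a simple "stuff" RetrievalQA chain.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
    )
    print(qa.run("What is this document about?"))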