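"""FastAPI service that fans each prompt out to several small LLMs (via LangChain's
VLLM wrapper), caches generations with GPTCache, and returns the answer that agrees
most with the other models according to sentence-embedding similarity."""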
import os
import hashlib

import torch
import uvicorn
import spaces
import langchain
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from langchain_community.llms import VLLM
from langchain_community.cache import GPTCache
from langchain_community.callbacks.manager import get_openai_callback
from gptcache import Cache
from gptcache.manager.factory import manager_factory
from gptcache.processor.pre import get_prompt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


app = FastAPI()


def get_hashed_name(name):
    """Hash the model name so each model's cache lives in its own directory."""
    return hashlib.sha256(name.encode()).hexdigest()


def init_gptcache(cache_obj: Cache, llm: str):
    """GPTCache init hook: set up an on-disk 'map' cache keyed by the hashed LLM name."""
    hashed_llm = get_hashed_name(llm)
    cache_obj.init(
        pre_embedding_func=get_prompt,
        data_manager=manager_factory(manager="map", data_dir=f"map_cache_{hashed_llm}"),
    )


hf_token = os.environ.get("HF_TOKEN")

# Three small instruct/code models, all loaded on CPU through LangChain's VLLM wrapper.
llm_models = {
    "yi-coder": VLLM(
        model="01-ai/Yi-Coder-1.5B",
        trust_remote_code=True,
        use_cuda=False,
        max_new_tokens=50,
        temperature=0.6,
        use_auth_token=hf_token,
        device="cpu",
    ),
    "llama": VLLM(
        model="meta-llama/Llama-3.2-3B-Instruct",
        trust_remote_code=True,
        use_cuda=False,
        max_new_tokens=50,
        temperature=0.1,
        use_auth_token=hf_token,
        device="cpu",
    ),
    "qwen": VLLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        trust_remote_code=True,
        use_cuda=False,
        max_new_tokens=50,
        temperature=0.6,
        use_auth_token=hf_token,
        device="cpu",
    ),
}
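
# LangChain's GPTCache wrapper takes an init function rather than a Cache instance:
# it calls init_gptcache(cache_obj, llm_string) once per LLM, so each model gets its
# own map cache and repeated prompts are served from cache instead of re-generated.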
langchain.llm_cache = GPTCache(init_gptcache)
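
# Sentence-embedding model used to score how much the models' outputs agree with one
# another; if it fails to load, the endpoint falls back to the first successful response.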
try:
    sentence_model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
except Exception as e:
    print(f"Error loading SentenceTransformer: {e}")
    sentence_model = None


@app.get("/")
def read_root():
    return {"Hello": "World"}
@app.post("/v1/generateText")
@spaces.GPU()
async def generateText(request: Request):
    request_dict = await request.json()
    prompt = request_dict.pop("prompt", None)
    if prompt is None:
        return JSONResponse({"error": "Request body must include a 'prompt' field"}, status_code=400)
    max_tokens = request_dict.get("max_tokens", -1)

    all_responses = {}
    for model_name, llm in llm_models.items():
        try:
            with get_openai_callback() as cb:
                if max_tokens == -1:
                    full_response = llm(prompt)
                else:
                    # Generate in chunks until the accumulated text reaches max_tokens
                    # (approximated by character count) or the model stops emitting text.
                    full_response = ""
                    current_prompt = prompt
                    while True:
                        response_part = llm(current_prompt, max_new_tokens=max_tokens)
                        full_response += response_part
                        if len(full_response) >= max_tokens or response_part == "":
                            break
                        # Feed the prompt plus everything generated so far back in to continue.
                        current_prompt = prompt + full_response
            print(cb)  # Token-usage summary from the callback (informational).
            all_responses[model_name] = full_response
            print(f"Model {model_name}: {full_response}")
        except Exception as e:
            print(f"Error with model {model_name}: {e}")

    if not all_responses:
        return JSONResponse({"error": "All models failed to generate text"}, status_code=500)

    if sentence_model:
        # Rank responses by consensus: embed each one and pick the response with the
        # highest mean cosine similarity to the rest.
        embeddings = sentence_model.encode(list(all_responses.values()))
        similarities = cosine_similarity(embeddings)
        avg_similarity = similarities.mean(axis=0)
        best_model = list(all_responses.keys())[avg_similarity.argmax()]
    else:
        # Embedding model unavailable: fall back to the first model that responded.
        best_model = list(all_responses.keys())[0]
    best_response = all_responses[best_model]

    return JSONResponse({"best_model": best_model, "text": best_response, "all_responses": all_responses})


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
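

# Example request (assumes the server is running locally on port 7860; the prompt is a
# placeholder):
#   curl -X POST http://localhost:7860/v1/generateText \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a Python function that reverses a string", "max_tokens": -1}'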