|
import json |
|
import numpy as np |
|
from sentence_transformers import SentenceTransformer |
|
from fastapi import FastAPI, HTTPException |
|
from fastapi.responses import StreamingResponse |
|
from pydantic import BaseModel |
|
from llama_cpp import Llama |
|
from huggingface_hub import login, hf_hub_download |
|
import logging |
|
import os |
|
import faiss |
|
|
|
|
|
# Process-wide logging: everything at INFO level and above is emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the start-up code and the request handlers.
logger = logging.getLogger(__name__)

# FastAPI application object; the route handlers in this file register on it.
app = FastAPI()
|
|
|
|
|
# Hugging Face authentication. The token is mandatory: when it is missing or
# empty the problem is logged and start-up is aborted with ValueError.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None or hf_token == "":
    logger.error("HF_TOKEN environment variable not set.")
    raise ValueError("HF_TOKEN not set")

login(token=hf_token)
|
|
|
|
|
# SentenceTransformer model used to embed the FAQ questions and user queries.
sentence_transformer_model = "all-MiniLM-L6-v2"

# Hugging Face Hub repository and GGUF file downloaded for the chat model.
repo_id, filename = (
    "bartowski/Llama-3.2-3B-Instruct-GGUF",
    "Llama-3.2-3B-Instruct-Q4_K_M.gguf",
)
|
|
|
|
|
# Canned question/answer pairs. A query that is close enough to one of these
# questions is answered directly from this table instead of by the LLM.
_FAQ_PAIRS = (
    (
        "What is your name?",
        "My name is Tim Luka Horstmann.",
    ),
    (
        "Where do you live?",
        "I live in Paris, France.",
    ),
    (
        "What is your education?",
        "I am currently pursuing a MSc in Data and AI at Institut Polytechnique de Paris. "
        "I also hold an MPhil in Advanced Computer Science from the University of Cambridge "
        "and a BSc in Business Informatics from RheinMain University of Applied Sciences.",
    ),
    (
        "What are your skills?",
        "I am proficient in Python, Java, SQL, Cypher, SPARQL, VBA, JavaScript, HTML/CSS, and Ruby. "
        "I also use tools like PyTorch, Hugging Face, Scikit-Learn, NumPy, Pandas, Matplotlib, "
        "Jupyter, Git, Bash, IoT, Ansible, QuickSight, and Wordpress.",
    ),
)

# Each entry is a dict with exactly the keys "question" and "answer".
faqs = [
    {"question": faq_question, "answer": faq_answer}
    for faq_question, faq_answer in _FAQ_PAIRS
]
|
|
|
# One-time start-up work: build the CV retrieval index, load the sentence
# embedder, pre-embed the FAQ questions and load the LLM. Any failure is
# logged with its traceback and re-raised so the service never starts
# half-initialised.
try:
    # --- CV retrieval index ---------------------------------------------
    logger.info("Loading CV embeddings from cv_embeddings.json")
    with open("cv_embeddings.json", "r", encoding="utf-8") as f:
        cv_data = json.load(f)
    cv_chunks = [item["chunk"] for item in cv_data]
    cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype("float32")
    # Fail with a readable message instead of a cryptic IndexError / FAISS
    # error when the file is empty or the embeddings do not form a matrix.
    if cv_embeddings.ndim != 2 or cv_embeddings.shape[0] == 0:
        raise ValueError(
            "cv_embeddings.json must contain a non-empty list of equally sized "
            f"embeddings; got an array of shape {cv_embeddings.shape}"
        )
    # Unit-length vectors + inner-product index give cosine-similarity search.
    faiss.normalize_L2(cv_embeddings)
    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
    faiss_index.add(cv_embeddings)
    logger.info("FAISS index built successfully")

    # --- Sentence embedder ----------------------------------------------
    logger.info("Loading SentenceTransformer model")
    embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
    logger.info("SentenceTransformer model loaded")

    # --- FAQ question embeddings (normalised like the queries will be) ---
    faq_questions = [faq["question"] for faq in faqs]
    faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(faq_embeddings)

    # --- LLM --------------------------------------------------------------
    logger.info(f"Loading {filename} model")
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        # Use the fixed /app/cache directory only when HF_HOME is set;
        # otherwise no explicit download directory is passed.
        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
        token=hf_token,
    )
    generator = Llama(
        model_path=model_path,
        n_ctx=1024,
        n_threads=2,
        n_batch=512,
        n_gpu_layers=0,  # no layers offloaded to a GPU
        verbose=True,
    )
    logger.info(f"{filename} model loaded")

except Exception as e:
    logger.error(f"Startup error: {str(e)}", exc_info=True)
    raise
|
|
|
def retrieve_context(query, top_k=3):
    """Return the CV chunks most similar to *query*, joined by newlines.

    The query is embedded with the module-level ``embedder``, L2-normalised
    and searched against the module-level ``faiss_index``.

    Args:
        query: Text to embed and look up.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks in ranking order, separated by a newline.
        Fewer than ``top_k`` chunks are returned when the index cannot
        supply that many results.

    Raises:
        Exception: Any failure while embedding or searching is logged and
            re-raised unchanged.
    """
    try:
        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)
        _, indices = faiss_index.search(query_embedding, top_k)
        # FAISS pads missing results with -1 when it has fewer than top_k
        # hits; without this filter cv_chunks[-1] would silently inject the
        # last chunk into the context.
        return "\n".join(cv_chunks[i] for i in indices[0] if i >= 0)
    except Exception as e:
        logger.error(f"Error in retrieve_context: {str(e)}")
        raise
|
|
|
def _sse_event(text):
    """Frame *text* as one Server-Sent Event, one ``data:`` field per line."""
    # A raw newline inside a single "data:" field ends that field, and the
    # remainder is parsed as an unknown field and dropped by SSE clients.
    # Emitting every line as its own "data:" field lets the client rebuild
    # the original text, newlines included.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return "".join(f"data: {line}\n" for line in lines) + "\n"


def stream_response(query):
    """Yield Server-Sent Events that answer *query*.

    The query is embedded and compared with the pre-computed FAQ question
    embeddings. When the best similarity exceeds 0.9 the stored FAQ answer is
    sent as a single event. Otherwise CV context is retrieved with
    ``retrieve_context`` and the completion produced by ``generator`` is
    streamed chunk by chunk.

    Args:
        query: The user's question.

    Yields:
        SSE-formatted strings. Every stream ends with ``data: [DONE]``. If
        anything fails, the error is logged and sent as an ``Error: ...``
        event followed by ``data: [DONE]``.
    """
    try:
        logger.info(f"Processing query: {query}")

        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)
        # Both sides are unit vectors, so the dot product is cosine similarity.
        similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
        best = int(np.argmax(similarities))
        if similarities[best] > 0.9:
            yield _sse_event(faqs[best]["answer"])
            yield "data: [DONE]\n\n"
        else:
            context = retrieve_context(query)
            # The end markers must be spelled exactly like the "<|im_end|>"
            # stop sequence below (the closing "|" was previously missing).
            # NOTE(review): these are ChatML-style markers, while `filename`
            # points at a Llama-3.2 Instruct GGUF - confirm this template is
            # the one intended for that model.
            prompt = (
                f"<|im_start|>system\nYou are Tim Luka Horstmann, a Computer Scientist. Here is your CV:\n{context}\n"
                f"A user is asking you a question about your CV. Respond as yourself, using the first person, and base your answer strictly on the information provided in the CV. Do not invent or assume any details not mentioned.\n<|im_end|>\n"
                f"<|im_start|>user\n{query}\n<|im_end|>\n"
                f"<|im_start|>assistant\n"
            )
            for chunk in generator(
                prompt,
                max_tokens=512,
                stream=True,
                stop=["<|im_end|>", "[DONE]"],
                temperature=0.5,
                top_p=0.9,
                repeat_penalty=1.1,
            ):
                yield _sse_event(chunk["choices"][0]["text"])
            yield "data: [DONE]\n\n"
    except Exception as e:
        logger.error(f"Error in stream_response: {str(e)}")
        yield _sse_event(f"Error: {str(e)}")
        yield "data: [DONE]\n\n"
|
|
|
class QueryRequest(BaseModel):
    # Request body of POST /api/predict.
    # `data` is a JSON array; the handler uses its first element as the query.
    data: list
|
|
|
@app.post("/api/predict")
async def predict(request: QueryRequest):
    """Stream the answer to the query in ``request.data[0]`` as Server-Sent Events.

    Raises:
        HTTPException: 400 when ``data`` is empty or its first element is
            not a non-blank string.
    """
    # QueryRequest already guarantees that `data` is a list, so only
    # emptiness has to be checked here.
    if not request.data:
        raise HTTPException(status_code=400, detail="Invalid input: 'data' must be a non-empty list")
    query = request.data[0]
    # Reject non-text payloads up front; otherwise they only fail inside the
    # stream, after a 200 response has already been started.
    if not isinstance(query, str) or not query.strip():
        raise HTTPException(status_code=400, detail="Invalid input: 'data[0]' must be a non-empty string")
    return StreamingResponse(stream_response(query), media_type="text/event-stream")
|
|
|
# Liveness probe: always answers with a constant payload.
@app.get("/health")
async def health_check():
    payload = {"status": "healthy"}
    return payload
|
|
|
# Reports static model names plus the size and dimensionality of the CV data
# loaded at start-up.
@app.get("/model_info")
async def model_info():
    info = {
        "model_name": "Llama-3.2-3B-Instruct-GGUF",
        "model_size": "3B",
    }
    info["embedding_model"] = sentence_transformer_model
    # Number of CV chunks and width of each embedding vector.
    info["faiss_index_size"] = len(cv_chunks)
    info["faiss_index_dim"] = cv_embeddings.shape[1]
    return info
|
|
|
@app.on_event("startup")
async def warm_up_model():
    """Run one throw-away query through ``stream_response`` at start-up."""
    logger.info("Warming up the model...")
    warmup_events = stream_response("Hi")
    # The generator is lazy: it has to be drained for the work to happen.
    # The produced events themselves are discarded.
    for _event in warmup_events:
        continue
    logger.info("Model warm-up complete.")