# endpoint.py
import json
import os

from dotenv import load_dotenv
from fastapi import FastAPI
from pydantic import BaseModel

from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq

# Load secrets from the environment / a local .env file
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY is not set; add it to the environment or a .env file")

# Init the LLM and embedding model used by LlamaIndex
Settings.llm = Groq(model="llama3-8b-8192", api_key=GROQ_API_KEY)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Load the index persisted earlier to ./storage and wrap it in a context chat engine
PERSIST_DIR = "./storage"
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)
chat_engine = index.as_chat_engine(chat_mode="context", verbose=False)

app = FastAPI()


class QueryRequest(BaseModel):
    question: str


class RecommendRequest(BaseModel):
    query: str


@app.get("/health")
def health_check():
    return {"status": "healthy"}


# A sync (non-async) handler, so FastAPI runs the blocking chat() call in its
# threadpool instead of stalling the event loop.
@app.post("/recommend")
def recommend(request: RecommendRequest):
    prompt = f"""
You are an intelligent assistant that recommends SHL assessments based on user queries.
Using the query: "{request.query}", return **all relevant and matching** SHL assessments (at least 1 and up to 10).
Only respond in this exact JSON format:
{{
    "recommended_assessments": [
        {{
            "url": "Valid URL in string",
            "adaptive_support": "Yes/No",
            "description": "Description in string",
            "duration": 60,
            "remote_support": "Yes/No",
            "test_type": ["List of string"]
        }}
    ]
}}
Do not include any explanations or extra text. Only return pure JSON.
Respond with as many matching assessments as possible (up to 10).
"""
    response = chat_engine.chat(prompt)
    try:
        return json.loads(response.response)
    except json.JSONDecodeError:
        return {"error": "Model response was not valid JSON", "raw": response.response}
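
For reference, a minimal client sketch for exercising the service. It assumes the app is served locally (e.g. with `uvicorn endpoint:app --port 8000`) and that the `requests` package is installed; the host, port, and query text below are illustrative assumptions, not taken from the source.

# client_example.py — hypothetical client, not part of endpoint.py.
# Assumes the server is running locally on port 8000.
import requests

resp = requests.post(
    "http://localhost:8000/recommend",
    json={"query": "Java developer assessment under 45 minutes"},  # illustrative query
    timeout=60,
)
resp.raise_for_status()
print(resp.json())  # parsed recommendations, or the error payload on invalid JSON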