import os
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
import re
import requests
import hashlib
from tempfile import NamedTemporaryFile
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS  # FAISS instead of Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers.ensemble import EnsembleRetriever
# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# === Hugging Face Router LLM Call ===
def generate_response(prompt: str) -> str:
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {
        "model": "openai/gpt-oss-120b:novita",
        "messages": [
            {"role": "system", "content": "You are a helpful health insurance assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 800
    }
    response = requests.post("https://router.huggingface.co/v1/chat/completions", headers=headers, json=payload)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
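# Illustrative call (assumes HF_TOKEN is set and the router model above is
# reachable from this environment):
#   print(generate_response("Is cataract surgery subject to a waiting period?"))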
# === Prompt Template ===
template = """You are an expert Health Insurance Policy Assistant.
Your tasks:
1. First, determine whether the user is asking for:
- a factual explanation (intent-based), or
- a coverage decision (decision-based).
2. Then:
- If the query is intent-based, answer in **1-2 clear sentences** based on the provided policy excerpt [Detailed reason/Detailed Explanation].
- If the query is decision-based, identify both the coverage clause or benefit section and any exclusion or waiting period clause that applies, then respond in this format:
  [Yes/No] → [Procedure] is [covered/not covered] under [Coverage Clause/Section] and subject to [Exclusion/Waiting Period Clause/Section] because [Detailed reason/Detailed Explanation].
User question: {query}
Policy excerpt: {context}
Your answer:
"""
prompt = ChatPromptTemplate.from_template(template)
# === PDF Utilities ===
def load_remote_pdf(url: str) -> str:
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, stream=True, headers=headers)
    response.raise_for_status()
    content_type = response.headers.get("Content-Type", "")
    if "application/pdf" not in content_type:
        raise ValueError("URL did not return a PDF file.")
    with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        for chunk in response.iter_content(chunk_size=8192):
            tmp.write(chunk)
    return tmp.name
# === Text Cleaner ===
def clean_and_normalize_text(text: str) -> str:
    text = text.replace("Section C.1.", "\n[WAITING_PERIOD]\nSection C.1.")
    text = text.replace("Section C.2.", "\n[STANDARD_EXCLUSIONS]\nSection C.2.")
    text = text.replace("Section C.3.", "\n[SPECIFIC_EXCLUSIONS]\nSection C.3.")
    text = text.replace("Specified disease/procedure waiting period (Excl02)", "\n[EXCL02_SPECIFIC_DISEASE]\nSpecified disease/procedure waiting period (Excl02)")
    text = text.replace("Pre-existing Diseases (Excl01)", "\n[EXCL01_PRE_EXISTING]\nPre-existing Diseases (Excl01)")
    text = text.replace("Room Rent Limit", "\n[ROOM_RENT_LIMIT]\nRoom Rent Limit")
    text = text.replace("Ayush Benefit", "\n[AYUSH_BENEFIT]\nAyush Benefit")
    text = text.replace("Ectopic pregnancy", "\n[EXCEPTION_ECTOPIC]\nEctopic pregnancy")
    text = re.sub(r'\nPage \d+\s*\|.*?\n', '\n', text)
    text = re.sub(r'HDFC ERGO.*?license\.', '', text, flags=re.DOTALL)
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)
    text = re.sub(r'\n(?=\w)', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n{2,}', '\n\n', text)
    return text.strip()
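# Example: every occurrence of "Section C.1." in the raw page text is prefixed
# with the "[WAITING_PERIOD]" marker, which build_rag_chain below matches on to
# tag that page with metadata["section"] = "waiting" before chunking.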
# === QUERY PREPROCESSOR ===
def query_preprocessor(query: str) -> str:
    query = query.strip()
    # Patterns
    age_pattern = re.search(r'\b(\d{1,3})\s*(?:yo|year[- ]?old)?\s*[mMfF]?\b', query)
    gender_pattern = re.search(r'\b(?:male|female|[mMfF])\b', query, re.IGNORECASE)
    procedure_pattern = re.search(r"(c[- ]?section|caesarean|surgery|dialysis|stroke|cataract|heart attack|delivery|obesity|knee replacement|ayush)", query, re.IGNORECASE)
    location_pattern = re.search(r"in\s+([a-zA-Z\s]+)", query)
    duration_pattern = re.search(r"(\d+)\s*[-]?\s*month", query, re.IGNORECASE)
    # Compose output
    parts = []
    if age_pattern:
        parts.append(f"Age: {age_pattern.group(1)}")
    if gender_pattern:
        gender = gender_pattern.group(0).upper()
        gender = 'Male' if gender.startswith('M') else 'Female'
        parts.append(f"Gender: {gender}")
    if procedure_pattern:
        parts.append(f"Procedure: {procedure_pattern.group(0).strip().title()}")
    if location_pattern:
        parts.append(f"Location: {location_pattern.group(1).strip().title()}")
    if duration_pattern:
        parts.append(f"Policy Duration: {duration_pattern.group(1)} months")
    parts.append(f"Original Query: {query}")
    return ". ".join(parts)
# === Load and process PDF ===
def build_rag_chain(pdf_path: str, rebuild_index=False):
    embeddings = SentenceTransformerEmbeddings(model_name="intfloat/e5-small-v2")
    final_chunks = []  # ensure it's always defined

    if not rebuild_index and os.path.exists("/tmp/faiss_index"):
        print("Loading existing FAISS index...")
        vectorstore = FAISS.load_local("/tmp/faiss_index", embeddings, allow_dangerous_deserialization=True)
        # Also reload chunks for BM25
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        for doc in docs:
            text = clean_and_normalize_text(doc.page_content)
            doc.page_content = text
            final_chunks.append(doc)
    else:
        print("Building FAISS index from scratch...")
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        for doc in docs:
            text = clean_and_normalize_text(doc.page_content)
            doc.page_content = text
            if "[WAITING_PERIOD]" in text:
                doc.metadata["section"] = "waiting"
            elif "[STANDARD_EXCLUSIONS]" in text:
                doc.metadata["section"] = "standard_exclusion"
            elif "[SPECIFIC_EXCLUSIONS]" in text:
                doc.metadata["section"] = "specific_exclusion"
            elif "Schedule of Benefits" in text:
                doc.metadata["section"] = "schedule"
            else:
                doc.metadata["section"] = "general"
        splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        for doc in docs:
            splits = splitter.split_text(doc.page_content)
            for chunk_text in splits:
                final_chunks.append(Document(page_content=chunk_text, metadata=doc.metadata))
        vectorstore = FAISS.from_documents(final_chunks, embeddings)
        vectorstore.save_local("/tmp/faiss_index")  # Save index

    # Create retrievers
    bm25_retriever = BM25Retriever.from_documents(final_chunks)
    bm25_retriever.k = 5
    retriever = EnsembleRetriever(
        retrievers=[
            bm25_retriever,
            vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 12, "lambda_mult": 0.5})
        ],
        weights=[0.4, 0.6]
    )

    chain = (
        {
            "context": retriever,
            "query": lambda q: f"Original Query: {q}\n\nPreprocessed Query: {query_preprocessor(q)}"
        }
        | prompt
        | (lambda chat_prompt: generate_response(chat_prompt.to_string()))
        | StrOutputParser()
    )
    return chain
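# --- Example usage (sketch) ---
# Minimal illustration of wiring the pieces together; the PDF URL below is a
# hypothetical placeholder, not a real policy document.
if __name__ == "__main__":
    sample_pdf_url = "https://example.com/sample-health-policy.pdf"  # placeholder
    local_pdf = load_remote_pdf(sample_pdf_url)
    rag_chain = build_rag_chain(local_pdf)
    print(rag_chain.invoke("46 year old male, knee replacement in Pune, 3-month policy"))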