import os

os.environ["HF_HOME"] = "/tmp/hf_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"

import re
import requests
import hashlib
from tempfile import NamedTemporaryFile
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS  # ✅ FAISS instead of Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers.ensemble import EnsembleRetriever

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# === Hugging Face Router LLM Call ===
def generate_response(prompt: str) -> str:
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {
        "model": "openai/gpt-oss-120b:novita",
        "messages": [
            {"role": "system", "content": "You are a helpful health insurance assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 800
    }

    response = requests.post("https://router.huggingface.co/v1/chat/completions", headers=headers, json=payload)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
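
# Illustrative call (not part of the pipeline): returns the assistant's reply text
# from the router's chat-completions response, e.g.
#   answer = generate_response("Is cataract surgery covered after a 12-month waiting period?")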

# === Prompt Template ===
template = """You are an expert Health Insurance Policy Assistant.

Your tasks:
1. First, determine whether the user is asking for:
   - a factual explanation (intent-based), or
   - a coverage decision (decision-based).

2. Then:
- If the query is intent-based, answer in **1-2 clear sentences** based on the provided policy excerpt [Detailed reason/Detailed Explanation].
- If the query is decision-based, identify both the coverage clause or benefit section and the exclusion or waiting-period clause (if any applies), then respond in this format:
  [Yes/No] – [Procedure] is [covered/not covered] under [Coverage Clause/Section] and subject to [Exclusion/Waiting Period Clause/Section] because [Detailed reason/Detailed Explanation].

User question: {query}
Policy excerpt: {context}

Your answer:
"""




prompt = ChatPromptTemplate.from_template(template)

# === PDF Utilities ===
def load_remote_pdf(url: str) -> str:
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, stream=True, headers=headers)
    response.raise_for_status()

    content_type = response.headers.get("Content-Type", "")
    if "application/pdf" not in content_type:
        raise ValueError("URL did not return a PDF file.")

    with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        for chunk in response.iter_content(chunk_size=8192):
            tmp.write(chunk)
        return tmp.name

# === Text Cleaner ===
def clean_and_normalize_text(text: str) -> str:
    text = text.replace("Section C.1.", "\n[WAITING_PERIOD]\nSection C.1.")
    text = text.replace("Section C.2.", "\n[STANDARD_EXCLUSIONS]\nSection C.2.")
    text = text.replace("Section C.3.", "\n[SPECIFIC_EXCLUSIONS]\nSection C.3.")
    text = text.replace("Specified disease/procedure waiting period (Excl02)", "\n[EXCL02_SPECIFIC_DISEASE]\nSpecified disease/procedure waiting period (Excl02)")
    text = text.replace("Pre-existing Diseases (Excl01)", "\n[EXCL01_PRE_EXISTING]\nPre-existing Diseases (Excl01)")
    text = text.replace("Room Rent Limit", "\n[ROOM_RENT_LIMIT]\nRoom Rent Limit")
    text = text.replace("Ayush Benefit", "\n[AYUSH_BENEFIT]\nAyush Benefit")
    text = text.replace("Ectopic pregnancy", "\n[EXCEPTION_ECTOPIC]\nEctopic pregnancy")

    # Strip page headers/footers and insurer boilerplate, then fix hyphenation and whitespace
    text = re.sub(r'\nPage \d+\s*\|.*?\n', '\n', text)                  # page header/footer lines
    text = re.sub(r'HDFC ERGO.*?license\.', '', text, flags=re.DOTALL)  # insurer boilerplate block
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)                      # re-join words hyphenated across line breaks
    text = re.sub(r'\n(?=\w)', ' ', text)                               # unwrap single line breaks inside sentences
    text = re.sub(r' +', ' ', text)                                     # collapse repeated spaces
    text = re.sub(r'\n{2,}', '\n\n', text)                              # normalize blank lines
    return text.strip()

# === QUERY PREPROCESSOR ===
def query_preprocessor(query: str) -> str:
    query = query.strip()

    # Patterns
    age_pattern = re.search(r'\b(\d{1,3})\s*(?:yo|year[- ]?old)?\s*[mMfF]?\b', query)
    gender_pattern = re.search(r'\b(?:male|female|[mMfF])\b', query)
    procedure_pattern = re.search(r"(c[- ]?section|caesarean|surgery|dialysis|stroke|cataract|heart attack|delivery|obesity|knee replacement|ayush)", query, re.IGNORECASE)
    location_pattern = re.search(r"in\s+([a-zA-Z\s]+)", query)
    duration_pattern = re.search(r"(\d+)\s*[-]?\s*month", query, re.IGNORECASE)

    # Compose output
    parts = []

    if age_pattern:
        parts.append(f"Age: {age_pattern.group(1)}")
    if gender_pattern:
        gender = gender_pattern.group(0).upper()
        gender = 'Male' if gender.startswith('M') else 'Female'
        parts.append(f"Gender: {gender}")
    if procedure_pattern:
        parts.append(f"Procedure: {procedure_pattern.group(0).strip().title()}")
    if location_pattern:
        parts.append(f"Location: {location_pattern.group(1).strip().title()}")
    if duration_pattern:
        parts.append(f"Policy Duration: {duration_pattern.group(1)} months")

    parts.append(f"Original Query: {query}")
    return ". ".join(parts)

# === Load and process PDF ===
def build_rag_chain(pdf_path: str, rebuild_index=False):
    embeddings = SentenceTransformerEmbeddings(model_name="intfloat/e5-small-v2")
    final_chunks = []  # ✅ ensure it's always defined

    if not rebuild_index and os.path.exists("/tmp/faiss_index"):
        print("πŸ”Ή Loading existing FAISS index...")
        vectorstore = FAISS.load_local("/tmp/faiss_index", embeddings, allow_dangerous_deserialization=True)

        # Also reload chunks for BM25
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        for doc in docs:
            text = clean_and_normalize_text(doc.page_content)
            doc.page_content = text
            final_chunks.append(doc)

    else:
        print("πŸ”Ή Building FAISS index from scratch...")
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()

        for doc in docs:
            text = clean_and_normalize_text(doc.page_content)
            doc.page_content = text
            if "[WAITING_PERIOD]" in text:
                doc.metadata["section"] = "waiting"
            elif "[STANDARD_EXCLUSIONS]" in text:
                doc.metadata["section"] = "standard_exclusion"
            elif "[SPECIFIC_EXCLUSIONS]" in text:
                doc.metadata["section"] = "specific_exclusion"
            elif "Schedule of Benefits" in text:
                doc.metadata["section"] = "schedule"
            else:
                doc.metadata["section"] = "general"

        splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
        for doc in docs:
            splits = splitter.split_text(doc.page_content)
            for chunk_text in splits:
                final_chunks.append(Document(page_content=chunk_text, metadata=doc.metadata))

        vectorstore = FAISS.from_documents(final_chunks, embeddings)
        vectorstore.save_local("/tmp/faiss_index")  # ✅ Save index

    # ✅ Create retrievers
    bm25_retriever = BM25Retriever.from_documents(final_chunks)
    bm25_retriever.k = 5
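
    # Combine lexical (BM25) and dense (FAISS MMR) retrieval; dense results get
    # slightly more weight (0.6 vs 0.4) in the ensemble below.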

    retriever = EnsembleRetriever(
        retrievers=[
            bm25_retriever,
            vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 12, "lambda_mult": 0.5})
        ],
        weights=[0.4, 0.6]
    )

    chain = (
        {
            "context": retriever,
            "query": lambda q: f"Original Query: {q}\n\nPreprocessed Query: {query_preprocessor(q)}"
        }
        | prompt
        | (lambda chat_prompt: generate_response(chat_prompt.to_string()))
        | StrOutputParser()
    )

    return chain
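
# === Minimal usage sketch (illustrative) ===
# Assumes HF_TOKEN is available via .env and that the policy document URL below is a
# placeholder; replace it with a real policy PDF URL (or skip load_remote_pdf for a local file).
if __name__ == "__main__":
    policy_url = "https://example.com/sample-policy.pdf"  # placeholder, not a real document
    pdf_path = load_remote_pdf(policy_url)
    rag_chain = build_rag_chain(pdf_path)
    question = "Is knee replacement surgery covered under a 3-month-old policy?"
    print(rag_chain.invoke(question))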