Spaces:

atharvasc27112001
/

medical_chatbot

Running

App Files Files Community

atharvasc27112001 committed 9 days ago

Commit

e1e0f62

verified ·

1 Parent(s): fdd7edb

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -52

app.py CHANGED Viewed

@@ -1,70 +1,89 @@
 import os
 import pandas as pd
 import torch
 import transformers
-from torch.nn.functional import cosine_similarity
 import gradio as gr
-# ── 1) Constants & Device ────────────────────────────────────────────────
-MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
-MIN_FREQ   = 4
-MAX_LEN    = 256
-DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
-# ── 2) Load & Filter Dataset ─────────────────────────────────────────────
 df = pd.read_csv("medquad.csv")
-df["text"] = df["question"].str.strip() + " " + df["answer"].str.strip()
-vc   = df["focus_area"].value_counts()
 keep = vc[vc >= MIN_FREQ].index
-df   = df[df["focus_area"].isin(keep)].reset_index(drop=True)
 labels   = sorted(df["focus_area"].unique())
 label2id = {lbl:i for i,lbl in enumerate(labels)}
-id2label = {i:l for l,i in label2id.items()}
-# ── 3) Load Tokenizer & Frozen BERT ─────────────────────────────────────
-tokenizer  = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
-bert_model = transformers.AutoModel.from_pretrained(MODEL_NAME).to(DEVICE).eval()
 @torch.no_grad()
-def encode_text(s: str, max_length=MAX_LEN):
-    toks   = tokenizer(
-        s,
-        return_tensors="pt",
-        truncation=True,
-        max_length=max_length,
-        padding=False,
-    ).to(DEVICE)
-    hidden = bert_model(**toks).last_hidden_state
-    return hidden[:,0].squeeze().cpu()
-# ── 4) Precompute Static Label Embeddings ─────────────────────────────────
-label_embs = torch.stack([encode_text(lbl, max_length=16) for lbl in labels])
-# ── 5) Classification Function ────────────────────────────────────────────
-def predict_disease(symptoms: str):
-    if not symptoms.strip():
         return "❗️ Please enter your symptoms."
     try:
-        # 1) embed user text → [hidden_size]
-        q_emb = encode_text(symptoms)
-        # 2) compute cosine similarities → [num_labels]
-        sims = cosine_similarity(
-            label_embs,            # [num_labels, hidden_size]
-            q_emb.unsqueeze(0),    # [1, hidden_size]
-            dim=1
-        )
-        # 3) pick the best label index
-        best = sims.argmax().item()
-        return id2label[best]
     except Exception as e:
         return f"Error: {e}"
-# ── 6) Gradio Interface ───────────────────────────────────────────────────
 app = gr.Interface(
     fn=predict_disease,
     inputs=gr.Textbox(
@@ -73,12 +92,10 @@ app = gr.Interface(
     ),
     outputs="text",
     title="🔬 Symptom→Disease Chatbot",
-    description="PubMed-BERT + cosine similarity"
 )
-# ── 7) Launch ─────────────────────────────────────────────────────────────
-app.launch(
-    server_name="0.0.0.0",
-    server_port=int(os.environ.get("PORT", 7860)),
-    share=False
-)

 import os
+import re, random, hashlib
 import pandas as pd
+import numpy as np
 import torch
 import transformers
 import gradio as gr
+from torch import nn
+from torch.nn.functional import cosine_similarity
+# ── Configuration ────────────────────────────────────────────────────────────
+MODEL_NAME      = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"  # checkpoint used for both the tokenizer and the encoder
+DEVICE          = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when torch reports one, otherwise CPU
+MIN_FREQ        = 4  # focus areas with fewer rows than this are pruned from the dataset
+MAX_LEN         = 256  # default max_length passed to the tokenizer by encode_text
+VERBALIZE_LABEL = True  # when True, verbalise() wraps each label in a sentence before embedding
+# ── 1) Load & Clean Data ─────────────────────────────────────────────────────
 df = pd.read_csv("medquad.csv")
+# build text field
+df["text"] = df["question"].fillna("").str.strip() + " " + df["answer"].fillna("").str.strip()
+df = df.dropna(subset=["text"]).reset_index(drop=True)
+# normalize hyphens/spaces in both text and labels
+dash_pat = r"[-‐-–—]"
+df["text"] = df["text"].str.replace(dash_pat, " ", regex=True)
+df["focus_area"] = (
+    df["focus_area"]
+      .fillna("")
+      .astype(str)
+      .str.replace(dash_pat, " ", regex=True)
+      .str.lower()
+      .str.replace(r"\s+", " ", regex=True)
+      .str.strip()
+)
+# prune rare labels
+vc = df["focus_area"].value_counts()
 keep = vc[vc >= MIN_FREQ].index
+df = df[df["focus_area"].isin(keep)].reset_index(drop=True)
+# ── 2) Tokenizer & Frozen BERT ───────────────────────────────────────────────
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
+bert_model = transformers.AutoModel.from_pretrained(MODEL_NAME) \
+                        .to(DEVICE).eval()
+# ── 3) Label ↔ ID maps & label embeddings ────────────────────────────────────
+def verbalise(lbl: str) -> str:
+    if VERBALIZE_LABEL:
+        return f"This question is about the medical focus area of {lbl}."
+    return lbl
 labels   = sorted(df["focus_area"].unique())
 label2id = {lbl:i for i,lbl in enumerate(labels)}
+id2label = {i:lbl for lbl,i in label2id.items()}
 @torch.no_grad()
+def encode_text(s: str, max_length=MAX_LEN) -> torch.Tensor:
+    toks = tokenizer(s, return_tensors="pt",
+                     truncation=True, max_length=max_length,
+                     padding=False).to(DEVICE)
+    out = bert_model(**toks).last_hidden_state[:,0]  # CLS
+    return out.squeeze().cpu()
+# precompute one vector per label
+label_embs = torch.stack([
+    encode_text(verbalise(lbl), max_length=32)
+    for lbl in labels
+])
+# ── 4) Prediction function ──────────────────────────────────────────────────
+def predict_disease(symptoms: str) -> str:
+    symptoms = symptoms.strip()
+    if not symptoms:
         return "❗️ Please enter your symptoms."
     try:
+        # embed user input
+        q_emb = encode_text(symptoms).unsqueeze(0)  # [1, hidden]
+        # cosine with each label embedding
+        sims  = cosine_similarity(label_embs, q_emb, dim=1)  # [num_labels]
+        idx   = sims.argmax().item()
+        return labels[idx]
     except Exception as e:
         return f"Error: {e}"
+# ── 5) Gradio App ───────────────────────────────────────────────────────────
 app = gr.Interface(
     fn=predict_disease,
     inputs=gr.Textbox(
     ),
     outputs="text",
     title="🔬 Symptom→Disease Chatbot",
+    description="PubMed-BERT frozen embeddings + cosine similarity"
 )
+if __name__ == "__main__":  # start the server only when run as a script, not on import
+    app.launch(server_name="0.0.0.0",  # bind to all network interfaces
+               server_port=int(os.environ.get("PORT", 7860)),  # PORT env var if set, else 7860
+               share=False)