Upload app.py
app.py
ADDED
@@ -0,0 +1,248 @@
import gradio as gr

from RadEval import RadEval

def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with the selected metrics on a single reference/hypothesis pair.
    """
    try:
        refs = [ref_text.strip()]
        hyps = [hyp_text.strip()]

        # Configure RadEval based on the selected metrics
        config = {
            'do_radgraph': 'RadGraph F1' in selected_metrics,
            'do_bleu': 'BLEU' in selected_metrics,
            'do_rouge': 'ROUGE' in selected_metrics,
            'do_bertscore': 'BERTScore' in selected_metrics,
            'do_chexbert': 'CheXbert F1' in selected_metrics,
            'do_ratescore': 'RaTEScore' in selected_metrics,
            'do_radcliq': 'RadCliQ' in selected_metrics,
            'do_temporal': 'Temporal F1' in selected_metrics,
            'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
            'do_green': 'GREEN' in selected_metrics,
            'do_srr_bert': 'SRR-BERT' in selected_metrics
        }
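        # Every unchecked metric leaves its flag False, so RadEval should skip
        # loading that metric's underlying model entirely. The spelling of
        # 'do_radeval_bertsore' follows the upstream keyword argument.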

        # Initialize RadEval with only the selected metrics enabled
        evaluator = RadEval(**config)

        # Run evaluation
        results = evaluator(refs=refs, hyps=hyps)

        # Prepare results for display
        table_data = []
        analysis_text = "## RadEval Results\n\n"
        analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n"
        analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n"
        analysis_text += "### Evaluation Scores:\n\n"

        for metric, score in results.items():
            if isinstance(score, (int, float)):
                formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score)
                table_data.append([metric, formatted_score])
                analysis_text += f"- **{metric}**: {formatted_score}\n"
            elif isinstance(score, dict):
                # Flatten nested metrics into "parent_child" rows
                for sub_metric, sub_score in score.items():
                    if isinstance(sub_score, (int, float)):
                        formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score)
                        metric_name = f"{metric}_{sub_metric}"
                        table_data.append([metric_name, formatted_score])
                        analysis_text += f"- **{metric_name}**: {formatted_score}\n"

        if not table_data:
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        return analysis_text, table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure the RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]
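
# The loop above produces one flattened row per nested sub-score. For example,
# a hypothetical nested return value such as
# {"rouge": {"rouge1": 0.52, "rougeL": 0.48}} would surface in the table as
# "rouge_rouge1" and "rouge_rougeL". Exact key names depend on the installed
# RadEval version.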

# Example pairs for radiology reports
examples = {
    "Normal vs Normal": {
        "ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    },
    "Pneumonia Case": {
        "ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    },
    "Temporal Comparison": {
        "ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    },
    "Discordant Reports": {
        "ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    },
    "Ambiguous Language": {
        "ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
    },
    "Surgical Follow-up": {
        "ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
    },
    "False Positive": {
        "ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "hyp": "Right lower lobe consolidation concerning for pneumonia.",
    },
    "Textual Hallucination": {
        "ref": "Heart and mediastinum are normal. Lungs are clear.",
        "hyp": "Large left pleural effusion with mediastinal shift to the right.",
    },
    "Negation Challenge": {
        "ref": "No evidence of pneumothorax or pleural effusion.",
        "hyp": "Evidence of small pneumothorax on the right.",
    },
    "Fine-grained Difference": {
        "ref": "Mild interstitial markings at the lung bases, likely chronic.",
        "hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    }
}
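
# The examples range from near-paraphrase ("Normal vs Normal") to outright
# contradiction ("Negation Challenge"), which makes it easy to eyeball how each
# metric separates surface agreement from clinically significant error.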

def update_fields(choice):
    """Update the text fields based on the example selection."""
    if choice == "Custom":
        return gr.update(value="", interactive=True), gr.update(value="", interactive=True)
    else:
        return (
            gr.update(value=examples[choice]["ref"], interactive=False),
            gr.update(value=examples[choice]["hyp"], interactive=False)
        )
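
# gr.update returns a property patch rather than a new component, so a single
# callback can both fill in the example text and lock the fields against
# editing (they are re-enabled when "Custom" is selected).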

# Available metrics (ordered roughly by computational cost)
available_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN"
]

# Fast metrics used as the default selection
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
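
# The defaults stick to n-gram and embedding metrics that need no large model
# downloads, so a first evaluation returns quickly even on CPU.
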
with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🩺 RadEval: A framework for radiology text evaluation
        [GitHub]() | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**

        This demo currently runs on **CPU**, so the slower metrics (RadGraph, CheXbert, GREEN) can take several minutes to complete. Please be patient.
        """
    )

    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📝 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )
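
    # Selecting a preset immediately swaps in its texts and makes the boxes
    # read-only; choosing "Custom" clears and unlocks them again (see
    # update_fields above).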

    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📊 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:

            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings

            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: BERTScore with an encoder specialized for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation

            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (require model downloads)
            """
        )

    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )

if __name__ == "__main__":
    demo.launch()
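
# ---------------------------------------------------------------------------
# Usage sketch (comments only, nothing here executes): the same evaluation can
# be run without the UI. A minimal example, assuming a RadEval install whose
# constructor flags match those passed via `config` above:
#
#     from RadEval import RadEval
#     evaluator = RadEval(do_bleu=True, do_rouge=True)
#     scores = evaluator(
#         refs=["Heart size is normal. Lungs are clear."],
#         hyps=["Cardiac silhouette is within normal limits. Lungs are clear."],
#     )
#     print(scores)
#
# Run the app locally with `python app.py`.
# ---------------------------------------------------------------------------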