Upload app.py
app.py
ADDED
@@ -0,0 +1,248 @@
import gradio as gr

from RadEval import RadEval

def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with the selected metrics on a single reference/hypothesis pair.
    """
    try:
        refs = [ref_text.strip()]
        hyps = [hyp_text.strip()]

        # Configure RadEval based on the selected metrics
        config = {
            'do_radgraph': 'RadGraph F1' in selected_metrics,
            'do_bleu': 'BLEU' in selected_metrics,
            'do_rouge': 'ROUGE' in selected_metrics,
            'do_bertscore': 'BERTScore' in selected_metrics,
            'do_chexbert': 'CheXbert F1' in selected_metrics,
            'do_ratescore': 'RaTEScore' in selected_metrics,
            'do_radcliq': 'RadCliQ' in selected_metrics,
            'do_temporal': 'Temporal F1' in selected_metrics,
            'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
            'do_green': 'GREEN' in selected_metrics,
            'do_srr_bert': 'SRR-BERT' in selected_metrics
        }
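        # Every unchecked metric leaves its flag False, so RadEval should skip
        # loading that metric's underlying model entirely. The spelling of
        # 'do_radeval_bertsore' follows the upstream keyword argument.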

        # Initialize RadEval with only the selected metrics enabled
        evaluator = RadEval(**config)

        # Run evaluation
        results = evaluator(refs=refs, hyps=hyps)

        # Prepare results for display
        table_data = []
        analysis_text = "## RadEval Results\n\n"
        analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n"
        analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n"
        analysis_text += "### Evaluation Scores:\n\n"

        for metric, score in results.items():
            if isinstance(score, (int, float)):
                formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score)
                table_data.append([metric, formatted_score])
                analysis_text += f"- **{metric}**: {formatted_score}\n"
            elif isinstance(score, dict):
                # Flatten nested metrics into "parent_child" rows
                for sub_metric, sub_score in score.items():
                    if isinstance(sub_score, (int, float)):
                        formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score)
                        metric_name = f"{metric}_{sub_metric}"
                        table_data.append([metric_name, formatted_score])
                        analysis_text += f"- **{metric_name}**: {formatted_score}\n"

        if not table_data:
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        return analysis_text, table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure the RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]
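
# The loop above produces one flattened row per nested sub-score. For example,
# a hypothetical nested return value such as
# {"rouge": {"rouge1": 0.52, "rougeL": 0.48}} would surface in the table as
# "rouge_rouge1" and "rouge_rougeL". Exact key names depend on the installed
# RadEval version.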

# Example pairs for radiology reports
examples = {
    "Normal vs Normal": {
        "ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    },
    "Pneumonia Case": {
        "ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    },
    "Temporal Comparison": {
        "ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    },
    "Discordant Reports": {
        "ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    },
    "Ambiguous Language": {
        "ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
    },
    "Surgical Follow-up": {
        "ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
    },
    "False Positive": {
        "ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "hyp": "Right lower lobe consolidation concerning for pneumonia.",
    },
    "Textual Hallucination": {
        "ref": "Heart and mediastinum are normal. Lungs are clear.",
        "hyp": "Large left pleural effusion with mediastinal shift to the right.",
    },
    "Negation Challenge": {
        "ref": "No evidence of pneumothorax or pleural effusion.",
        "hyp": "Evidence of small pneumothorax on the right.",
    },
    "Fine-grained Difference": {
        "ref": "Mild interstitial markings at the lung bases, likely chronic.",
        "hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    }
}
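
# The examples range from near-paraphrase ("Normal vs Normal") to outright
# contradiction ("Negation Challenge"), which makes it easy to eyeball how each
# metric separates surface agreement from clinically significant error.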

def update_fields(choice):
    """Update the text fields based on the example selection."""
    if choice == "Custom":
        return gr.update(value="", interactive=True), gr.update(value="", interactive=True)
    else:
        return (
            gr.update(value=examples[choice]["ref"], interactive=False),
            gr.update(value=examples[choice]["hyp"], interactive=False)
        )
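
# gr.update returns a property patch rather than a new component, so a single
# callback can both fill in the example text and lock the fields against
# editing (they are re-enabled when "Custom" is selected).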

# Available metrics (ordered roughly by computational cost)
available_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN"
]

# Fast metrics used as the default selection
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
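
# The defaults stick to n-gram and embedding metrics that need no large model
# downloads, so a first evaluation returns quickly even on CPU.
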
with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🩺 RadEval: A framework for radiology text evaluation
        [GitHub]() | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**

        This demo currently runs on **CPU**, so the slower metrics (RadGraph, CheXbert, GREEN) can take several minutes to complete. Please be patient.
        """
    )

    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📝 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )
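
    # Selecting a preset immediately swaps in its texts and makes the boxes
    # read-only; choosing "Custom" clears and unlocks them again (see
    # update_fields above).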

    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📊 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:

            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings

            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: BERTScore with an encoder specialized for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation

            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (require model downloads)
            """
        )

    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )

if __name__ == "__main__":
    demo.launch()
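
# ---------------------------------------------------------------------------
# Usage sketch (comments only, nothing here executes): the same evaluation can
# be run without the UI. A minimal example, assuming a RadEval install whose
# constructor flags match those passed via `config` above:
#
#     from RadEval import RadEval
#     evaluator = RadEval(do_bleu=True, do_rouge=True)
#     scores = evaluator(
#         refs=["Heart size is normal. Lungs are clear."],
#         hyps=["Cardiac silhouette is within normal limits. Lungs are clear."],
#     )
#     print(scores)
#
# Run the app locally with `python app.py`.
# ---------------------------------------------------------------------------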