X-iZhang commited on
Commit
c5b8c18
Β·
verified Β·
1 Parent(s): 093cf6f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -0
app.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import sys
3
+ import os
4
+
5
+ from RadEval import RadEval
6
+
7
def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """Evaluate one hypothesis report against one reference report with RadEval.

    Parameters
    ----------
    ref_text : str
        Ground-truth (reference) radiology report.
    hyp_text : str
        Generated (hypothesis) radiology report.
    selected_metrics : list[str]
        UI metric labels to compute; must match the labels checked when
        building the RadEval config below.

    Returns
    -------
    tuple[str, list[list[str]]]
        A markdown summary and a ``[metric, score]`` table, matching the two
        Gradio outputs. On failure both carry an error description so the UI
        always has something to display.
    """
    # Guard clauses: fail fast with a readable message instead of handing
    # empty input to the (potentially slow, model-loading) evaluators.
    if not ref_text or not ref_text.strip() or not hyp_text or not hyp_text.strip():
        error_msg = "Input Error: please provide both a reference and a hypothesis report."
        return error_msg, [["Error", error_msg]]
    if not selected_metrics:
        return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

    try:
        refs = [ref_text.strip()]
        hyps = [hyp_text.strip()]

        # Map UI labels to RadEval constructor flags.
        # NOTE(review): 'do_radeval_bertsore' mirrors the RadEval package's own
        # (misspelled) keyword — confirm against the installed version before
        # "fixing" the spelling here.
        config = {
            'do_radgraph': 'RadGraph F1' in selected_metrics,
            'do_bleu': 'BLEU' in selected_metrics,
            'do_rouge': 'ROUGE' in selected_metrics,
            'do_bertscore': 'BERTScore' in selected_metrics,
            'do_chexbert': 'CheXbert F1' in selected_metrics,
            'do_ratescore': 'RaTEScore' in selected_metrics,
            'do_radcliq': 'RadCliQ' in selected_metrics,
            'do_temporal': 'Temporal F1' in selected_metrics,
            'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
            'do_green': 'GREEN' in selected_metrics,
            'do_srr_bert': 'SRR-BERT' in selected_metrics
        }

        # Initialize RadEval with only the selected metrics enabled.
        evaluator = RadEval(**config)

        # Run evaluation; RadEval returns a dict of scalar scores and/or
        # nested dicts of sub-scores.
        results = evaluator(refs=refs, hyps=hyps)

        # Flatten results (including one level of nesting) into table rows
        # and a parallel markdown summary.
        table_data = []
        analysis_text = "## RadEval Results\n\n"
        analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n"
        analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n"
        analysis_text += "### Evaluation Scores:\n\n"

        for metric, score in results.items():
            if isinstance(score, (int, float)):
                formatted_score = _format_score(score)
                table_data.append([metric, formatted_score])
                analysis_text += f"- **{metric}**: {formatted_score}\n"
            elif isinstance(score, dict):
                # Nested metrics are reported as "<parent>_<child>".
                for sub_metric, sub_score in score.items():
                    if isinstance(sub_score, (int, float)):
                        formatted_score = _format_score(sub_score)
                        metric_name = f"{metric}_{sub_metric}"
                        table_data.append([metric_name, formatted_score])
                        analysis_text += f"- **{metric_name}**: {formatted_score}\n"

        # Defensive: results contained nothing numeric to show.
        if not table_data:
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        return analysis_text, table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        # Broad catch is deliberate: this is a top-level UI handler and must
        # return displayable output rather than crash the app.
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]


def _format_score(value):
    """Render a numeric score: floats to 4 decimal places, ints verbatim."""
    return f"{value:.4f}" if isinstance(value, float) else str(value)
69
+
70
+
71
# Example pairs for radiology reports.
# Each entry maps a scenario label (shown verbatim in the UI radio selector
# via `list(examples.keys())`) to a reference ("ref", ground truth) and
# hypothesis ("hyp", generated) report pair. `update_fields` looks entries up
# by the same label, so keys must stay unique and unchanged.
examples = {
    "Normal vs Normal": {
        "ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    },
    "Pneumonia Case": {
        "ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    },
    "Temporal Comparison": {
        "ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    },
    "Discordant Reports": {
        "ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    },
    "Ambiguous Language": {
        "ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
    },
    "Surgical Follow-up": {
        "ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
    },
    "False Positive": {
        "ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "hyp": "Right lower lobe consolidation concerning for pneumonia.",
    },
    "Textual Hallucination": {
        "ref": "Heart and mediastinum are normal. Lungs are clear.",
        "hyp": "Large left pleural effusion with mediastinal shift to the right.",
    },
    "Negation Challenge": {
        "ref": "No evidence of pneumothorax or pleural effusion.",
        "hyp": "Evidence of small pneumothorax on the right.",
    },
    "Fine-grained Difference": {
        "ref": "Mild interstitial markings at the lung bases, likely chronic.",
        "hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    }
}
114
+
115
def update_fields(choice):
    """Sync the reference/hypothesis textboxes with the selected example.

    "Custom" clears both boxes and makes them editable; any named example
    loads its canned report pair and locks the boxes against edits.
    """
    if choice == "Custom":
        return (
            gr.update(value="", interactive=True),
            gr.update(value="", interactive=True),
        )
    pair = examples[choice]
    return (
        gr.update(value=pair["ref"], interactive=False),
        gr.update(value=pair["hyp"], interactive=False),
    )
124
+
125
+
126
# Available metrics (ordered by computational complexity).
# These labels are both the CheckboxGroup choices and the strings matched in
# `run_radeval_simple` when building the RadEval config — renaming one here
# without updating that mapping would silently disable the metric.
available_metrics = [
    "BLEU",
    "ROUGE",
    "BERTScore",
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN"
]

# Fast metrics for default selection (keeps the CPU-only demo responsive).
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
143
+
144
+
145
# ---------------------------------------------------------------------------
# Gradio UI: header, example picker, report inputs, metric selection,
# run button, results area, and a collapsible metric reference.
# (Fixes mis-encoded emoji in the user-facing strings.)
# ---------------------------------------------------------------------------
with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
    # NOTE(review): the "Github" label below points to the PyPI URL, and the
    # PyPI / arXiv / Expert Dataset links are empty — confirm intended targets.
    gr.Markdown(
        """
        # 🩺 RadEval: A framework for radiology text evaluation
        [Github](https://pypi.org/project/RadEval/) | [PyPI]() | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**

        The demo is currently running on **CPU**. When using some slower metrics (like RadGraph, CheXbert, GREEN), it may take a while to complete evaluation. Please be patient.
        """
    )

    # Example selector: "Custom" keeps the textboxes editable; named examples
    # fill them via update_fields and lock editing.
    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    # Side-by-side report inputs: reference (left) and hypothesis (right).
    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📄 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    # Re-populate both textboxes whenever the example selection changes.
    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )

    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")

    # Results: markdown summary (left, wider) and a metric/score table (right).
    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📈 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section: static reference text for the available metrics.
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:

            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings

            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: Specialized BERT for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation

            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads)
            """
        )

    # Wire the button to the evaluation function; outputs match its
    # (markdown, table) return pair.
    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )
246
+
247
# Start the Gradio server only when this file is executed as a script,
# not when imported.
if __name__ == "__main__":
    demo.launch()