File size: 38,426 Bytes
21a375e
 
c8373c1
 
 
 
 
 
 
 
680cbe9
 
 
b65f2ef
 
7e2b9c0
c8373c1
 
 
 
b65f2ef
 
 
 
 
1a12918
b65f2ef
 
 
 
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21a375e
c8373c1
21a375e
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21a375e
c8373c1
 
 
 
c2c76a6
 
 
 
 
 
 
 
85d9e2b
 
 
c2c76a6
85d9e2b
e65749f
85d9e2b
e65749f
85d9e2b
e65749f
 
85d9e2b
 
 
 
 
c2c76a6
 
 
 
 
 
 
 
 
 
85d9e2b
 
 
 
 
 
 
 
 
c2c76a6
 
85d9e2b
 
 
 
 
 
 
c2c76a6
85d9e2b
 
 
c2c76a6
 
85d9e2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e65749f
 
 
 
 
 
c2c76a6
85d9e2b
 
 
 
 
 
 
 
 
 
e65749f
85d9e2b
c2c76a6
85d9e2b
c2c76a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85d9e2b
 
c2c76a6
 
 
 
 
 
 
 
85d9e2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2c76a6
c8373c1
 
 
 
 
 
 
 
 
d30ceda
 
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d30ceda
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f7b0cc
c8373c1
 
 
d30ceda
 
 
 
 
c8373c1
 
 
d30ceda
c8373c1
 
 
 
 
 
 
 
680cbe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6542cc6
 
 
 
 
 
 
 
 
 
 
 
 
680cbe9
 
 
df4f503
 
680cbe9
df4f503
 
 
 
680cbe9
df4f503
 
 
 
 
680cbe9
df4f503
 
680cbe9
df4f503
 
680cbe9
df4f503
680cbe9
df4f503
680cbe9
df4f503
 
680cbe9
df4f503
 
680cbe9
df4f503
 
680cbe9
df4f503
680cbe9
df4f503
 
 
 
 
 
 
 
 
 
 
 
680cbe9
df4f503
 
 
 
c8373c1
 
 
 
 
7e2b9c0
c8373c1
680cbe9
 
 
c8373c1
680cbe9
 
 
 
 
 
 
 
 
c8373c1
fc1afa7
680cbe9
 
 
 
 
 
 
 
 
 
c8373c1
 
b65f2ef
 
 
 
680cbe9
b65f2ef
c8373c1
7e2b9c0
 
 
 
 
 
 
 
 
 
 
fc1afa7
7e2b9c0
 
fc1afa7
 
7e2b9c0
 
 
 
 
 
 
 
 
 
 
 
b65f2ef
 
fc1afa7
b65f2ef
7e2b9c0
b65f2ef
 
 
 
 
 
 
 
7e2b9c0
 
 
 
 
 
 
 
 
 
 
 
b65f2ef
 
 
 
 
 
 
 
 
 
 
 
 
df4f503
b65f2ef
 
 
7e2b9c0
 
 
 
 
 
b65f2ef
 
680cbe9
b65f2ef
 
7e2b9c0
b65f2ef
 
 
7e2b9c0
 
b65f2ef
 
 
 
df4f503
b65f2ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8373c1
680cbe9
fc1afa7
 
680cbe9
7e2b9c0
 
 
 
 
 
 
 
b65f2ef
c8373c1
 
 
 
 
 
 
 
 
 
 
 
21a375e
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21a375e
c8373c1
 
 
 
21a375e
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a12918
c8373c1
 
 
fe811f7
 
c8373c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2c76a6
 
 
c8373c1
 
 
 
 
 
 
c2c76a6
 
 
 
 
 
 
c8373c1
 
 
 
 
 
 
 
 
21a375e
c8373c1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
import ast
import html
import json
import os
import re
import shutil
import smtplib
import tempfile
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import gradio as gr
import numpy as np
import pandas as pd
import torch
from huggingface_hub import HfApi
from tqdm import tqdm

from stark_qa import load_qa
from stark_qa.evaluator import Evaluator

from utils.hub_storage import HubStorage
from utils.token_handler import TokenHandler

# Repository id used for every Hub call in this module (the calls below pass
# repo_type="space"). Replace with your own space name when redeploying.
REPO_ID = "snap-stanford/stark-leaderboard"

# Build the shared storage client once, at import time. Any failure is
# re-raised as RuntimeError, which aborts the import of this module.
try:
    hub_storage = HubStorage(REPO_ID)
except Exception as e:
    raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")


def process_single_instance(args):
    """Evaluate the submitted ranking for a single query.

    Args:
        args (tuple): ``(idx, eval_csv, qa_dataset, evaluator, eval_metrics)``
            packed into one object so the function can be submitted to an
            executor as a single argument. ``eval_csv`` is a DataFrame with
            ``query_id`` and ``pred_rank`` columns, ``qa_dataset[idx]`` yields
            ``(query, query_id, answer_ids, meta_info)`` and ``evaluator``
            exposes ``evaluate(pred_dict, answer_ids, metrics=...)``.

    Returns:
        The mapping returned by ``evaluator.evaluate`` with ``idx`` and
        ``query_id`` entries added.

    Raises:
        IndexError: No prediction row exists for this query.
        RuntimeError: The lookup failed or matched more than one row.
        ValueError: ``pred_rank`` is a string that is not a Python literal.
        TypeError: ``pred_rank`` is not a list.
    """
    idx, eval_csv, qa_dataset, evaluator, eval_metrics = args
    _, query_id, answer_ids, _ = qa_dataset[idx]

    try:
        matches = eval_csv.loc[eval_csv['query_id'] == query_id, 'pred_rank']
    except Exception as e:
        raise RuntimeError(f'Unexpected error occurred while fetching prediction rank for query_id={query_id}: {e}') from e

    # Check the match count explicitly: Series.item() raises ValueError (not
    # IndexError) on an empty selection, which used to hide this message.
    if len(matches) == 0:
        raise IndexError(f'Error when processing query_id={query_id}, please make sure the predicted results exist for this query.')
    if len(matches) > 1:
        raise RuntimeError(
            f'Unexpected error occurred while fetching prediction rank for query_id={query_id}: '
            f'found {len(matches)} rows for this query, expected exactly one.'
        )
    pred_rank = matches.item()

    if isinstance(pred_rank, str):
        # The CSV is user-supplied: parse it as a literal only, never execute it.
        try:
            pred_rank = ast.literal_eval(pred_rank)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            raise ValueError(f'Failed to parse pred_rank as a list for query_id={query_id}: {e}') from e

    if not isinstance(pred_rank, list):
        raise TypeError(f'Error when processing query_id={query_id}, expected pred_rank to be a list but got {type(pred_rank)}.')

    # Score only the first 100 candidates; earlier positions get higher scores.
    pred_dict = {candidate: -rank for rank, candidate in enumerate(pred_rank[:100])}
    answer_ids = torch.LongTensor(answer_ids)
    result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)

    result["idx"], result["query_id"] = idx, query_id
    return result


def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
    """Score a submitted prediction CSV against one dataset split.

    Args:
        csv_path: Path to a CSV with ``query_id`` and ``pred_rank`` columns.
        dataset: One of ``'amazon'``, ``'mag'`` or ``'prime'``.
        split: One of ``'test'``, ``'test-0.1'`` or ``'human_generated_eval'``.
        num_workers: Number of worker processes used for per-query evaluation.

    Returns:
        dict | str: On success, a mapping from each metric name (``hit@1``,
        ``hit@5``, ``recall@20``, ``mrr``) to its mean over the evaluated
        queries. On any failure, an error message string; no exception is
        propagated to the caller.
    """
    # Candidate id range per dataset. Kept as lazy ranges so that only the
    # selected dataset's id list is materialised.
    candidate_id_ranges = {
        'amazon': range(957192),
        'mag': range(1172724, 1872968),
        'prime': range(129375),
    }
    try:
        eval_csv = pd.read_csv(csv_path)
        if 'query_id' not in eval_csv.columns:
            raise ValueError('No `query_id` column found in the submitted csv.')
        if 'pred_rank' not in eval_csv.columns:
            raise ValueError('No `pred_rank` column found in the submitted csv.')

        eval_csv = eval_csv[['query_id', 'pred_rank']]

        if dataset not in candidate_id_ranges:
            raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_id_ranges.keys())}.")
        if split not in ['test', 'test-0.1', 'human_generated_eval']:
            raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")

        evaluator = Evaluator(list(candidate_id_ranges[dataset]))
        eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
        qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
        split_idx = qa_dataset.get_idx_split()
        all_indices = split_idx[split].tolist()

        results_list = []
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(
                    process_single_instance,
                    (idx, eval_csv, qa_dataset, evaluator, eval_metrics),
                )
                for idx in all_indices
            ]
            for future in tqdm(as_completed(futures), total=len(futures)):
                # future.result() re-raises any error raised inside the worker.
                results_list.append(future.result())

        # One row per evaluated query; average each metric column directly.
        results_df = pd.DataFrame(results_list)
        return {metric: np.mean(results_df[metric]) for metric in eval_metrics}

    except pd.errors.EmptyDataError:
        return "Error: The CSV file is empty or could not be read. Please check the file and try again."
    except FileNotFoundError:
        return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
    except Exception as error:
        return f"{error}"


# Data dictionaries for leaderboard
#
# Each dictionary is column-oriented: 'Method' lists the method names and every
# other key holds one metric column named 'STARK-<DATASET>_<metric>', with
# values in the same order as 'Method'. update_leaderboard_data writes
# submitted results into columns that follow this same naming scheme.
#
# data_synthesized_full backs df_synthesized_full, the table that
# update_leaderboard_data selects for the 'test' split.
# NOTE(review): values look like percentages — confirm the scale matches what
# the evaluator reports for submissions.
data_synthesized_full = {
    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
    'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
    'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
    'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
    'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
    'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
    'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
    'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
    'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
    'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
    'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
    'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
    'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
}

# Baseline rows behind df_synthesized_10, the table that
# update_leaderboard_data selects for the 'test-0.1' split. Column-oriented:
# every list is ordered like 'Method'. This table has two more methods (the
# rerankers) than data_synthesized_full.
data_synthesized_10 = {
    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
    'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
    'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
    'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
    'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
    'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
    'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
    'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
    'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
    'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
    'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
    'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
    'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
}

# Baseline rows behind df_human_generated, the table that
# update_leaderboard_data selects for the 'human_generated_eval' split.
# Column-oriented: every list is ordered like 'Method'.
# NOTE(review): BM25's STARK-AMAZON_MRR (18.79) is lower than its
# STARK-AMAZON_Hit@1 (27.16). If both are on the same scale, MRR cannot be
# below Hit@1, so one of the two values is probably a transcription error —
# verify against the source table.
data_human_generated = {
    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
    'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
    'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
    'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
    'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
    'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
    'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
    'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
    'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
    'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
    'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
    'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
    'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
}

# Initialize DataFrames
# Module-level leaderboard tables, one per split. initialize_leaderboard
# rebuilds them from the same dictionaries and update_leaderboard_data mutates
# them in place when a submission is loaded.
df_synthesized_full = pd.DataFrame(data_synthesized_full)
df_synthesized_10 = pd.DataFrame(data_synthesized_10)
df_human_generated = pd.DataFrame(data_human_generated)

# Model type definitions
# Maps a category label to the baseline method names that belong to it. The
# names match entries of the 'Method' columns above.
# NOTE(review): not referenced in the part of the file visible here —
# presumably used to filter the displayed tables; confirm against the UI code.
model_types = {
    'Sparse Retriever': ['BM25'],
    'Small Dense Retrievers': ['DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)'],
    'LLM-based Dense Retrievers': ['ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b'],
    'Multivector Retrievers': ['multi-ada-002', 'ColBERTv2'],
    'LLM Rerankers': ['Claude3 Reranker', 'GPT4 Reranker']
}

# Submission form validation functions
def validate_email(email_str):
    """Return True only if every ';'-separated entry is a well-formed address.

    Each entry is stripped of surrounding whitespace before it is checked, so
    an empty entry (for example after a trailing ';') makes the result False.
    """
    email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    for candidate in email_str.split(';'):
        if email_pattern.match(candidate.strip()) is None:
            return False
    return True

def validate_github_url(url):
    """Return True if ``url`` has the form http(s)://[www.]github.com/<owner>/<repo>[/]."""
    github_pattern = re.compile(
        r'^https?:\/\/(?:www\.)?github\.com\/[\w-]+\/[\w.-]+\/?$'
    )
    found = github_pattern.match(url)
    return found is not None

def validate_csv(file_obj):
    """Validate CSV file format and content.

    Only the first row's ``pred_rank`` is inspected.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the CSV file.

    Returns:
        tuple[bool, str]: ``(is_valid, message)``. No exception is propagated;
        read errors are reported through the message.
    """
    try:
        df = pd.read_csv(file_obj.name)

        if any(col not in df.columns for col in ('query_id', 'pred_rank')):
            return False, "CSV must contain 'query_id' and 'pred_rank' columns"

        if df.empty:
            return False, "CSV contains no prediction rows"

        try:
            first_rank = df['pred_rank'].iloc[0]
            if isinstance(first_rank, str):
                # Uploaded content is untrusted: parse literals only, never
                # evaluate it as code.
                first_rank = ast.literal_eval(first_rank)
        except Exception:
            return False, "Invalid pred_rank format"

        if not isinstance(first_rank, list) or len(first_rank) < 20:
            return False, "pred_rank must be a list with at least 20 candidates"

        return True, "Valid CSV file"
    except Exception as e:
        return False, f"Error processing CSV: {str(e)}"

def sanitize_name(name):
    """Return ``name`` with every character other than ASCII letters and digits replaced by '_'."""
    unsafe_char = re.compile(r'[^a-zA-Z0-9]')
    return unsafe_char.sub('_', name)

def scan_submissions_directory():
    """
    Scans the submissions directory and updates the leaderboard tables with all submitted results.

    Lists the repository's files, groups everything under
    ``submissions/<folder>/`` by folder and, for each folder whose
    ``latest.json`` has status ``approved``, loads the matching
    ``metadata_<timestamp>.json`` and passes it to ``update_leaderboard_data``.
    Failures for a single folder are printed and that folder is skipped.

    Returns:
        dict | None: Mapping from split name (``test``, ``test-0.1``,
        ``human_generated_eval``) to the list of loaded submission metadata
        dicts, or ``None`` when the listing fails, no submission files exist,
        or an unexpected error occurs.
    """
    try:
        # Initialize HuggingFace API
        api = HfApi()

        # Get submissions directory content from HuggingFace hub
        try:
            repo_files = api.list_repo_files(
                repo_id=REPO_ID,
                repo_type="space"
            )
        except Exception as e:
            print(f"Error listing repository contents: {str(e)}")
            return None

        # list_repo_files returns plain path strings, so they are filtered and
        # split directly (they have no `.path` attribute).
        submission_paths = [f for f in repo_files if f.startswith('submissions/')]
        if not submission_paths:
            print("No submissions directory found or empty")
            return None

        # Track submissions for each split
        submissions_by_split = {
            'test': [],
            'test-0.1': [],
            'human_generated_eval': []
        }

        # Group files by team folders
        folder_files = {}
        for path in submission_paths:
            parts = path.split('/')
            if len(parts) < 3:  # submissions/folder_name/file
                continue
            folder_files.setdefault(parts[1], []).append(path)

        # Process each team folder
        for folder_name, files in folder_files.items():
            try:
                # Look for latest.json
                latest_file = next((f for f in files if f.endswith('latest.json')), None)
                if not latest_file:
                    continue

                # Read latest.json
                try:
                    latest_content = hub_storage.get_file_content(latest_file)
                    latest_info = json.loads(latest_content)
                except Exception as e:
                    print(f"Error reading latest.json for {folder_name}: {str(e)}")
                    continue

                # Only approved submissions are shown on the leaderboard.
                if latest_info.get('status') != 'approved':
                    continue

                timestamp = latest_info.get('latest_submission')
                if not timestamp:
                    continue

                # Find corresponding metadata file
                metadata_file = next(
                    (f for f in files if f.endswith(f'metadata_{timestamp}.json')),
                    None
                )
                if not metadata_file:
                    continue

                # Read metadata file. hf_hub_download returns the local path of
                # the downloaded file, so the JSON is parsed from that file.
                try:
                    metadata_path = api.hf_hub_download(
                        repo_id=REPO_ID,
                        repo_type="space",
                        filename=metadata_file
                    )
                    with open(metadata_path, 'r', encoding='utf-8') as f:
                        submission_data = json.load(f)
                except Exception as e:
                    print(f"Error reading metadata for {folder_name}: {str(e)}")
                    continue

                split = submission_data.get('Split')
                if split in submissions_by_split:
                    submissions_by_split[split].append(submission_data)

                    # Update corresponding DataFrame
                    update_leaderboard_data(submission_data)
                    print(f"Added submission from {folder_name} to {split} leaderboard")

            except Exception as e:
                print(f"Error processing folder {folder_name}: {str(e)}")
                continue

        print("Leaderboard initialized with existing submissions:")
        for split, submissions in submissions_by_split.items():
            print(f"{split}: {len(submissions)} submissions")

        return submissions_by_split

    except Exception as e:
        print(f"Error scanning submissions directory: {str(e)}")
        return None

def initialize_leaderboard():
    """
    Initialize the leaderboard with baseline results and submitted results.

    Rebinds the three module-level leaderboard DataFrames to fresh copies
    built from the baseline dictionaries, then calls
    ``scan_submissions_directory`` to add stored submissions on top.

    Returns:
        None. Any exception is caught and printed rather than propagated.
    """
    global df_synthesized_full, df_synthesized_10, df_human_generated
    
    try:
        # First, initialize with baseline results
        # (this discards any rows added to the tables earlier)
        df_synthesized_full = pd.DataFrame(data_synthesized_full)
        df_synthesized_10 = pd.DataFrame(data_synthesized_10)
        df_human_generated = pd.DataFrame(data_human_generated)
        
        print("Initialized with baseline results")
        
        # Then scan and add submitted results
        # (its return value is not used here)
        scan_submissions_directory()
        
        print("Leaderboard initialization complete")
        
    except Exception as e:
        print(f"Error initializing leaderboard: {str(e)}")

# Utility function to get file content
def get_file_content(file_path):
    """
    Helper function to safely read file content from HuggingFace repository

    Args:
        file_path (str): Path of the file inside the repository.

    Returns:
        str | None: The file's content decoded as UTF-8, or ``None`` if the
        download or read fails (the error is printed).
    """
    try:
        api = HfApi()
        # hf_hub_download returns the local path of the downloaded file; the
        # content is then read from that path.
        local_path = api.hf_hub_download(
            repo_id=REPO_ID,
            repo_type="space",
            filename=file_path
        )
        with open(local_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {str(e)}")
        return None

def save_submission(submission_data, csv_file):
    """
    Save submission data and CSV file using model_name_team_name format

    Writes three files under ``submissions/<model>_<team>/`` relative to the
    current working directory: a timestamped copy of the predictions CSV, a
    timestamped metadata JSON, and ``latest.json`` pointing at this timestamp
    with status ``pending_review``.

    Args:
        submission_data (dict): Metadata and results for the submission. It is
            updated in place with ``csv_path``, ``submission_id`` and
            ``folder_name``.
        csv_file: The uploaded CSV, given either as a path (``str`` or
            ``os.PathLike``) or as an object whose ``name`` attribute is the
            path. Any other value is ignored and no CSV is copied.

    Returns:
        str: The submission id, ``<model>_<team>_<timestamp>``.
    """
    # Create folder name from model name and team name
    model_name_clean = sanitize_name(submission_data['Method Name'])
    team_name_clean = sanitize_name(submission_data['Team Name'])
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Create folder name: model_name_team_name
    folder_name = f"{model_name_clean}_{team_name_clean}"
    submission_id = f"{folder_name}_{timestamp}"

    # Create submission directory structure
    base_dir = "submissions"
    submission_dir = os.path.join(base_dir, folder_name)
    os.makedirs(submission_dir, exist_ok=True)

    # Save CSV file with timestamp to allow multiple submissions
    csv_filename = f"predictions_{timestamp}.csv"
    csv_path = os.path.join(submission_dir, csv_filename)

    # Accept a plain path as well as a file-like wrapper; previously a path
    # string was skipped silently because it has no `.name` attribute.
    if isinstance(csv_file, (str, os.PathLike)):
        source_path = csv_file
    else:
        source_path = getattr(csv_file, 'name', None)
    if source_path is not None:
        shutil.copyfile(source_path, csv_path)

    # Add file paths to submission data
    submission_data.update({
        "csv_path": csv_path,
        "submission_id": submission_id,
        "folder_name": folder_name
    })

    # Save metadata as JSON with timestamp
    metadata_path = os.path.join(submission_dir, f"metadata_{timestamp}.json")
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(submission_data, f, indent=4)

    # Update latest.json to track most recent submission
    latest_path = os.path.join(submission_dir, "latest.json")
    with open(latest_path, 'w', encoding='utf-8') as f:
        json.dump({
            "latest_submission": timestamp,
            "status": "pending_review",
            "method_name": submission_data['Method Name']
        }, f, indent=4)

    return submission_id

def update_leaderboard_data(submission_data):
    """
    Write one submission's metrics into the leaderboard table for its split.

    The table is chosen by ``submission_data['Split']`` and modified in place.
    If a row with the same method name already exists, only the four metric
    columns of the submission's dataset (and 'Method') are overwritten;
    otherwise a new row is appended. Only the method name is shown in the
    table, not the team name.

    Args:
        submission_data (dict): Must provide 'Split', 'Method Name',
            'Dataset' and a 'results' mapping with 'hit@1', 'hit@5',
            'recall@20' and 'mrr'.
    """
    global df_synthesized_full, df_synthesized_10, df_human_generated

    # Pick the table that belongs to the submission's split
    tables_by_split = {
        'test': df_synthesized_full,
        'test-0.1': df_synthesized_10,
        'human_generated_eval': df_human_generated
    }
    target_df = tables_by_split[submission_data['Split']]

    # Assemble the cells this submission contributes
    method_name = submission_data['Method Name']
    dataset_tag = submission_data["Dataset"].upper()
    results = submission_data['results']
    new_row = {
        'Method': method_name,
        f'STARK-{dataset_tag}_Hit@1': results['hit@1'],
        f'STARK-{dataset_tag}_Hit@5': results['hit@5'],
        f'STARK-{dataset_tag}_R@20': results['recall@20'],
        f'STARK-{dataset_tag}_MRR': results['mrr']
    }

    already_listed = target_df['Method'] == method_name
    if not already_listed.any():
        # Unknown method: append as a new row at the next positional label
        target_df.loc[len(target_df)] = new_row
        return

    # Known method: overwrite just the cells provided by this submission
    for column, value in new_row.items():
        target_df.loc[already_listed, column] = value

# Function to get emails from meta_data
def get_emails_from_metadata(meta_data):
    """
    Extracts emails from the meta_data dictionary.

    NOTE: a second definition of this function with the same name appears
    later in this file; being later, that one is the definition bound at
    import time. Keep the two in sync or remove one.

    Args:
        meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field
            as a ';'-separated string.

    Returns:
        list: A list of email addresses with surrounding whitespace removed.
        Empty entries are dropped, so a missing or blank field yields ``[]``
        rather than ``['']``.
    """
    raw_emails = meta_data.get("Contact Email(s)", "") or ""
    stripped = (email.strip() for email in raw_emails.split(";"))
    return [email for email in stripped if email]

# Function to format meta_data as an HTML table (without Prediction CSV)
def format_metadata_as_table(meta_data):
    """
    Formats metadata dictionary into an HTML table for the email.
    Handles multiple contact emails separated by a semicolon.

    Keys and values are HTML-escaped because they originate from the
    submission form. The "Prediction CSV" entry is left out of the table.

    Args:
        meta_data (dict): Dictionary containing submission metadata.

    Returns:
        str: HTML string representing the metadata table.
    """
    rows = []
    for key, value in meta_data.items():
        if key == "Prediction CSV":  # Exclude the Prediction CSV field
            continue
        if key == "Contact Email(s)":
            # Normalise "a;b ; c" to "a; b; c" for display
            value = "; ".join(email.strip() for email in str(value).split(';'))
        rows.append(
            f"<tr><td><b>{html.escape(str(key))}</b></td>"
            f"<td>{html.escape(str(value))}</td></tr>"
        )
    table_rows = "".join(rows)

    table_html = f"""
    <table border="1" cellpadding="5" cellspacing="0">
        {table_rows}
    </table>
    """
    return table_html

# Function to get emails from meta_data
def get_emails_from_metadata(meta_data):
    """
    Extracts emails from the meta_data dictionary.

    NOTE: this repeats an earlier definition of the same name in this file.
    Because it comes later, this is the definition bound at import time;
    the earlier copy can be removed.

    Args:
        meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field
            as a ';'-separated string.

    Returns:
        list: A list of email addresses with surrounding whitespace removed.
        Empty entries are dropped, so a missing or blank field yields ``[]``
        rather than ``['']``.
    """
    raw_emails = meta_data.get("Contact Email(s)", "") or ""
    stripped = (email.strip() for email in raw_emails.split(";"))
    return [email for email in stripped if email]
            
def send_error_notification(meta_data, error_info):
    """
    Sends an email notification about an error during the evaluation process.

    Delivery is best-effort: failures are printed and never raised.

    Args:
        meta_data (dict): Submission metadata to be included in the email.
        error_info (str): Error message or notification content to be included in the email.
            It is HTML-escaped before insertion so that text such as
            ``<class 'str'>`` is shown literally instead of being treated as
            a tag.

    Returns:
        None
    """
    # Drop blank entries so an empty contact field is never used as a recipient
    emails_to_send = [email for email in get_emails_from_metadata(meta_data) if email]
    if not emails_to_send:
        print("Failed to send error notification: no contact email available.")
        return

    send_from = 'stark-qa@cs.stanford.edu'
    recipients_str = ', '.join(emails_to_send)

    # Create the email container
    msg = MIMEMultipart('alternative')
    msg['Subject'] = 'STaRK Leaderboard Submission - Error Notification'
    msg['From'] = send_from
    msg['To'] = recipients_str

    # Format the metadata table
    metadata_table = format_metadata_as_table(meta_data)

    # The body is HTML, so the error text must be escaped
    safe_error_info = html.escape(str(error_info))

    # Email body content with metadata table
    body = f"""
    <p>Dear STaRK Leaderboard Participant,</p>

    <p>We encountered an issue during the evaluation of your recent submission:</p>

    <p><i>{safe_error_info}</i></p>

    <p>Please verify your inputs and resubmit. If the issue persists, feel free to contact us at stark-qa@cs.stanford.edu with the error details and your dataset information.</p>

    <p>Submitted Metadata:</p>
    {metadata_table}

    <p>Thank you for your participation.</p>

    <p>Best regards,<br>The STaRK QA Team</p>
    """

    msg.attach(MIMEText(body, 'html'))

    # Send the email
    try:
        with smtplib.SMTP('localhost') as server:
            server.sendmail(send_from, emails_to_send, msg.as_string())  # No CC for error notification
        print("Error notification sent successfully.")
    except Exception as e:
        print(f"Failed to send error notification: {e}")

def format_evaluation_results(results):
    """
    Render evaluation results as text, one ``metric: value`` pair per line.

    Args:
        results (dict): Dictionary containing evaluation metrics and their values.

    Returns:
        str: The pairs in the dictionary's iteration order, joined by
        newlines; an empty string for an empty dictionary.
    """
    return "\n".join(f"{metric}: {value}" for metric, value in results.items())

# Function to send a submission confirmation with evaluation results and metadata, CCing the sender
def send_submission_confirmation(meta_data, eval_results):
    """
    Sends an email notification confirming submission and including evaluation results.

    The message is addressed to the submission's contact emails and CCs the
    leaderboard address. Delivery is attempted through the local SMTP server
    first and through ``smtp.stanford.edu`` if that fails. The whole function
    is best-effort: any failure is printed as a warning and never raised, so
    a submission is not blocked by email problems.

    Args:
        meta_data (dict): Submission metadata to be included in the email.
        eval_results (dict): Evaluation metrics and their values.

    Returns:
        None
    """
    try:
        emails_to_send = get_emails_from_metadata(meta_data)
        send_from = 'stark-qa@cs.stanford.edu'
        recipients_str = ', '.join(emails_to_send)

        msg = MIMEMultipart('alternative')
        msg['Subject'] = 'STaRK Leaderboard Submission - Evaluation Results'
        msg['From'] = send_from
        msg['To'] = recipients_str
        msg['Cc'] = send_from

        formatted_results = format_evaluation_results(eval_results)
        metadata_table = format_metadata_as_table(meta_data)

        body = f"""
        <p>Dear STaRK Leaderboard Participant,</p>

        <p>Thank you for your submission to the STaRK leaderboard. Below are the results of your submission:</p>

        <pre>{formatted_results}</pre>

        <p>Submitted Metadata:</p>
        {metadata_table}

        <p>Your results have been added to the leaderboard. If you would like to withdraw your submission, 
        please reply to this email with "withdrawn."</p>

        <p>Best regards,<br>The STaRK QA Team</p>
        """

        msg.attach(MIMEText(body, 'html'))

        try:
            # First try localhost
            with smtplib.SMTP('localhost') as server:
                server.send_message(msg)
        except Exception as local_error:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit are not swallowed, and record why the fallback is used.
            print(f"Local SMTP delivery failed ({local_error}); trying remote SMTP server")
            with smtplib.SMTP('smtp.stanford.edu', 587) as server:
                server.starttls()
                server.send_message(msg)

        print(f"Submission confirmation sent successfully to {recipients_str}")
    except Exception as e:
        # Continue with submission even if email fails
        print(f"Warning: Failed to send email notification: {str(e)}")
    
def process_submission(
    method_name, team_name, dataset, split, contact_email,
    code_repo, csv_file, model_description, hardware, paper_link
):
    """Validate a leaderboard submission, evaluate it, and store the results.

    Steps, in order: validate the form fields, materialise the prediction
    CSV on disk, run ``compute_metrics``, upload the CSV and a metadata JSON
    through ``hub_storage.save_to_hub``, send the confirmation email, and
    update the leaderboard data.

    Args:
        method_name: Method name; required, at most 25 characters.
        team_name: Team name; required, at most 25 characters.
        dataset: Dataset name; lower-cased before evaluation.
        split: Evaluation split, passed through to ``compute_metrics``.
        contact_email: Contact email(s); checked with ``validate_email``.
        code_repo: Repository URL; checked with ``validate_github_url``.
        csv_file: Prediction CSV, either a path string, an object with a
            ``name`` attribute pointing at a file, a readable file-like
            object, or raw bytes.
        model_description: Free-text model description (not validated).
        hardware: Free-text hardware description (not validated).
        paper_link: Optional paper link (not validated).

    Returns:
        A human-readable status string: a validation error, an evaluation
        error, a processing error, or the success message with the scores.
        Exceptions raised inside the processing steps are caught, reported
        via ``send_error_notification`` and returned as an error string.
    """
    temp_files = []

    # Built before the ``try`` so the error handler below can always
    # reference it, even when one of the validators raises.
    meta_data = {
        "Method Name": method_name,
        "Team Name": team_name,
        "Dataset": dataset,
        "Split": split,
        "Contact Email(s)": contact_email,
        "Code Repository": code_repo,
        "Model Description": model_description,
        "Hardware": hardware,
        "(Optional) Paper link": paper_link
    }

    try:
        # Input validation
        if not all([method_name, team_name, dataset, split, contact_email, code_repo, csv_file]):
            return "Error: Please fill in all required fields"

        # Length validation
        if len(method_name) > 25:
            return "Error: Method name must be 25 characters or less"
        if len(team_name) > 25:
            return "Error: Team name must be 25 characters or less"
        if not validate_email(contact_email):
            return "Error: Invalid email format"
        if not validate_github_url(code_repo):
            return "Error: Invalid GitHub repository URL"

        # Save and process files
        REPO_ID = "snap-stanford/stark-leaderboard"  # Replace with your space name

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        folder_name = f"{sanitize_name(method_name)}_{sanitize_name(team_name)}"

        temp_csv_path = None
        if isinstance(csv_file, str):
            # Already a file path: use it directly. It is not added to
            # temp_files, so the caller's file is never deleted here.
            temp_csv_path = csv_file
        else:
            # Create a temporary file with a .csv extension
            temp_fd, temp_csv_path = tempfile.mkstemp(suffix='.csv')
            temp_files.append(temp_csv_path)
            os.close(temp_fd)

            # Write the content to the temporary file
            if hasattr(csv_file, 'name'):
                # File object backed by a path on disk
                shutil.copy2(csv_file.name, temp_csv_path)
            else:
                # Generic file-like object, or raw bytes
                with open(temp_csv_path, 'wb') as temp_file:
                    if hasattr(csv_file, 'seek'):
                        csv_file.seek(0)
                    if hasattr(csv_file, 'read'):
                        shutil.copyfileobj(csv_file, temp_file)
                    else:
                        temp_file.write(csv_file)

        # Verify the CSV file exists and is readable
        if not os.path.exists(temp_csv_path):
            raise FileNotFoundError(f"Failed to create temporary CSV file at {temp_csv_path}")

        # Process evaluation using the on-disk CSV
        results = compute_metrics(
            csv_path=temp_csv_path,
            dataset=dataset.lower(),
            split=split,
            num_workers=4
        )

        # compute_metrics signals failure by returning a message string
        if isinstance(results, str):
            send_error_notification(meta_data, results)
            return f"Evaluation error: {results}"

        csv_filename = f"predictions_{timestamp}.csv"
        csv_path_in_repo = f"submissions/{folder_name}/{csv_filename}"

        try:
            hub_storage.save_to_hub(
                file_content=temp_csv_path,
                path_in_repo=csv_path_in_repo,
                commit_message=f"Add submission: {method_name} by {team_name}"
            )
        except Exception as e:
            # Chain the cause so the original traceback is preserved
            raise RuntimeError(f"Failed to save CSV to HuggingFace Hub: {str(e)}") from e

        # Process results (multiply by 100)
        processed_results = {
            "hit@1": round(results['hit@1'] * 100, 2),
            "hit@5": round(results['hit@5'] * 100, 2),
            "recall@20": round(results['recall@20'] * 100, 2),
            "mrr": round(results['mrr'] * 100, 2)
        }

        # Save metadata
        submission_data = {
            **meta_data,
            "results": processed_results,
            "status": "approved", #pending_review
            "submission_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "csv_path": csv_path_in_repo
        }
        metadata_fd, temp_metadata_path = tempfile.mkstemp(suffix='.json')
        temp_files.append(temp_metadata_path)
        os.close(metadata_fd)

        with open(temp_metadata_path, 'w') as f:
            json.dump(submission_data, f, indent=4)

        metadata_path = f"submissions/{folder_name}/metadata_{timestamp}.json"

        try:
            hub_storage.save_to_hub(
                file_content=temp_metadata_path,
                path_in_repo=metadata_path,
                commit_message=f"Add metadata: {method_name} by {team_name}"
            )
        except Exception as e:
            raise RuntimeError(f"Failed to save metadata to HuggingFace Hub: {str(e)}") from e

        # Send confirmation email and update leaderboard
        send_submission_confirmation(meta_data, processed_results)
        update_leaderboard_data(submission_data)
        # NOTE(review): confirm `demo.update()` exists in the installed Gradio
        # version; if it raises, the user sees an error although the
        # submission has already been stored and confirmed by email.
        demo.update()

        return f"""
        Submission successful! 
        
        Evaluation Results:
        Hit@1: {processed_results['hit@1']:.2f}%
        Hit@5: {processed_results['hit@5']:.2f}%
        Recall@20: {processed_results['recall@20']:.2f}%
        MRR: {processed_results['mrr']:.2f}%
        
        Your submission has been saved and a confirmation email has been sent to {contact_email}.
        Once approved, your results will appear in the leaderboard under: {method_name}
        
        You can find your submission at:
        https://huggingface.co/spaces/{REPO_ID}/tree/main/submissions/{folder_name}
        """

    except Exception as e:
        error_message = f"Error processing submission: {str(e)}"
        # meta_data is assigned before the try block, so it is always bound here
        send_error_notification(meta_data, error_message)
        return error_message
    finally:
        # Clean up temporary files created by this call
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except Exception as e:
                print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")
    
def filter_by_model_type(df, selected_types):
    """Keep only the rows whose ``Method`` belongs to one of the selected model types.

    An empty (or falsy) selection yields an empty frame that keeps the
    original columns. Otherwise each selected type is looked up in the
    module-level ``model_types`` mapping and the rows are filtered on the
    combined list of method names.
    """
    if not selected_types:
        return df.head(0)

    wanted_methods = []
    for type_name in selected_types:
        wanted_methods.extend(model_types[type_name])

    keep_mask = df['Method'].isin(wanted_methods)
    return df[keep_mask]

def format_dataframe(df, dataset):
    """Build the display table for one dataset.

    Selects ``Method`` plus every column whose name contains ``dataset``,
    shortens each column name to the part after its last underscore, and
    returns a copy sorted by ``MRR`` in descending order. The input frame
    is not modified.
    """
    selected = ['Method']
    for name in df.columns:
        if dataset in name:
            selected.append(name)

    table = df[selected].copy()
    # rsplit leaves names without an underscore (e.g. 'Method') unchanged
    table.columns = [name.rsplit('_', 1)[-1] for name in table.columns]
    return table.sort_values('MRR', ascending=False)

def update_tables(selected_types):
    """Produce the nine leaderboard tables for the current model-type selection.

    The three module-level source frames (synthesized full, synthesized 10%,
    human-generated) are filtered by model type first; each filtered frame
    is then formatted once per dataset (AMAZON, MAG, PRIME). The returned
    list is ordered source-major, dataset-minor.
    """
    sources = (df_synthesized_full, df_synthesized_10, df_human_generated)
    filtered_frames = [filter_by_model_type(src, selected_types) for src in sources]

    return [
        format_dataframe(frame, f"STARK-{name}")
        for frame in filtered_frames
        for name in ('AMAZON', 'MAG', 'PRIME')
    ]


# Custom stylesheet passed to gr.Blocks(css=...) below:
#   - lets table header text wrap (white-space: normal)
#   - sets the --cell-width-1 custom property to 250px
#     (presumably consumed by Gradio's table styling -- TODO confirm)
#   - makes the second column's cell content horizontally scrollable
#   - adds a bottom border and spacing to elements with class .tab-nav
css = """
table > thead {
    white-space: normal
}

table {
    --cell-width-1: 250px
}

table > tbody > tr > td:nth-child(2) > div {
    overflow-x: auto
}

.tab-nav {
    border-bottom: 1px solid rgba(255, 255, 255, 0.1);
    margin-bottom: 1rem;
}
"""

# Main application
# Builds the Gradio UI: a model-type filter, three groups of per-dataset
# leaderboard tables, and the submission form. Components are rendered in the
# order they are created inside the `with` block.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
    gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")
    
    # Model type filter; every type in `model_types` is selected by default
    model_type_filter = gr.CheckboxGroup(
        choices=list(model_types.keys()),
        value=list(model_types.keys()),
        label="Model types",
        interactive=True
    )
    
    # Collects the nine table components. The creation order below
    # (full, 10%, human-generated; each AMAZON, MAG, PRIME) matches the order
    # of the list returned by update_tables, which is wired to these outputs.
    all_dfs = []
    
    # Create nested tabs structure: outer tab per split, inner tab per dataset
    with gr.Tabs() as outer_tabs:
        with gr.TabItem("Synthesized (full)"):
            with gr.Tabs() as inner_tabs1:
                for dataset in ['AMAZON', 'MAG', 'PRIME']:
                    with gr.TabItem(dataset):
                        all_dfs.append(gr.DataFrame(interactive=False))
                        
        with gr.TabItem("Synthesized (10%)"):
            with gr.Tabs() as inner_tabs2:
                for dataset in ['AMAZON', 'MAG', 'PRIME']:
                    with gr.TabItem(dataset):
                        all_dfs.append(gr.DataFrame(interactive=False))
                        
        with gr.TabItem("Human-Generated"):
            with gr.Tabs() as inner_tabs3:
                for dataset in ['AMAZON', 'MAG', 'PRIME']:
                    with gr.TabItem(dataset):
                        all_dfs.append(gr.DataFrame(interactive=False))
    
    # Submission section
    gr.Markdown("---")
    gr.Markdown("## Submit Your Results")
    gr.Markdown("""
    Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
    For questions, contact stark-qa@cs.stanford.edu
    """)
    
    # Two-column submission form. The component variables below are passed,
    # in order, as the inputs of process_submission.
    with gr.Row():
        with gr.Column():
            method_name = gr.Textbox(
                label="Method Name (max 25 chars)*",
                placeholder="e.g., MyRetrievalModel-v1"
            )
            team_name = gr.Textbox(
                label="Team Name (max 25 chars)*",
                placeholder="e.g., Stanford NLP"
            )
            # Note: rebinds the name `dataset`, previously used as the loop
            # variable in the tab-building loops above.
            dataset = gr.Dropdown(
                choices=["amazon", "mag", "prime"],
                label="Dataset*",
                value="amazon"
            )
            split = gr.Dropdown(
                choices=["test", "test-0.1", "human_generated_eval"],
                label="Split*",
                value="test"
            )
            contact_email = gr.Textbox(
                label="Contact Email(s)*",
                placeholder="email@example.com; another@example.com"
            )
        
        with gr.Column():
            code_repo = gr.Textbox(
                label="Code Repository*",
                placeholder="https://github.com/snap-stanford/stark-leaderboard"
            )
            csv_file = gr.File(
                label="Prediction CSV*",
                file_types=[".csv"],
                type="filepath"  # Important: specify type as filepath
            )
            # NOTE(review): this field and "Hardware Specifications" are
            # labelled as required (*), but process_submission does not
            # include them in its required-field check -- confirm intent.
            model_description = gr.Textbox(
                label="Model Description*",
                lines=3,
                placeholder="Briefly describe how your retriever model works..."
            )
            hardware = gr.Textbox(
                label="Hardware Specifications*",
                placeholder="e.g., 4x NVIDIA A100 80GB"
            )
            paper_link = gr.Textbox(
                label="Paper Link (Optional)",
                placeholder="https://arxiv.org/abs/..."
            )
    
    submit_btn = gr.Button("Submit", variant="primary")
    # Read-only box that displays the string returned by process_submission
    result = gr.Textbox(label="Submission Status", interactive=False)
    
    # Initialize leaderboard at startup (runs once, while the UI is being built)
    initialize_leaderboard()
    
    # Set up event handlers: re-render all nine tables when the filter changes
    model_type_filter.change(
        update_tables,
        inputs=[model_type_filter],
        outputs=all_dfs
    )
    
    # Initial table update when the page loads
    demo.load(
        update_tables,
        inputs=[model_type_filter],
        outputs=all_dfs
    )
    
    # Submit button: run the full submission pipeline and show its status text
    submit_btn.click(
        process_submission,
        inputs=[
            method_name, team_name, dataset, split, contact_email,
            code_repo, csv_file, model_description, hardware, paper_link
        ],
        outputs=result
    )
    

# Launch the application.
# Runs unconditionally at module level (there is no `__main__` guard), so
# importing this module also starts the app.
demo.launch()