Shiyu Zhao committed · Commit d38a2a4 · Parent(s): e6f9c92

Update space
app.py CHANGED
@@ -36,36 +36,35 @@ except Exception as e:
 result_lock = Lock()
 
 def process_single_instance(args):
-    idx, …
+    idx, eval_dict, qa_dataset, evaluator, eval_metrics = args
     query, query_id, answer_ids, meta_info = qa_dataset[idx]
+
     try:
-        # …
-        …
+        # Access prediction using dictionary instead of DataFrame
+        if query_id not in eval_dict:
+            raise IndexError(f'Error when processing query_id={query_id}, please make sure the predicted results exist for this query.')
+
+        pred_rank = eval_dict[query_id]
+
+        if isinstance(pred_rank, str):
+            try:
+                pred_rank = eval(pred_rank)
+            except SyntaxError as e:
+                raise ValueError(f'Failed to parse pred_rank as a list for query_id={query_id}: {e}')
+
+        if not isinstance(pred_rank, list):
+            raise TypeError(f'Error when processing query_id={query_id}, expected pred_rank to be a list but got {type(pred_rank)}.')
+
+        pred_dict = {pred_rank[i]: -i for i in range(min(100, len(pred_rank)))}
+        answer_ids = torch.LongTensor(answer_ids)
+
+        # Evaluate metrics
+        result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
+        result["idx"], result["query_id"] = idx, query_id
+        return result
+
     except Exception as e:
-        raise RuntimeError(f'…
-
-    if isinstance(pred_rank, str):
-        try:
-            pred_rank = eval(pred_rank)
-        except SyntaxError as e:
-            raise ValueError(f'Failed to parse pred_rank as a list for query_id={query_id}: {e}')
-
-    if not isinstance(pred_rank, list):
-        raise TypeError(f'Error when processing query_id={query_id}, expected pred_rank to be a list but got {type(pred_rank)}.')
-
-    pred_dict = {pred_rank[i]: -i for i in range(min(100, len(pred_rank)))}
-    answer_ids = torch.LongTensor(answer_ids)
-
-    # Evaluate metrics
-    result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
-    result["idx"], result["query_id"] = idx, query_id
-    return result
+        raise RuntimeError(f'Error processing query_id={query_id}: {str(e)}')
 
 def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int = 4):
     candidate_ids_dict = {
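Note: the patched function still parses pred_rank with the built-in eval, which will execute arbitrary expressions from the submitted CSV and only guards against SyntaxError. A minimal sketch of a safer parser using ast.literal_eval; the helper name parse_pred_rank is hypothetical, not part of this commit:

```python
import ast

def parse_pred_rank(raw):
    """Parse a stringified ranking such as "[3, 1, 2]" into a list."""
    if isinstance(raw, str):
        try:
            # literal_eval only accepts Python literals, so a malicious
            # submission cannot execute code during parsing.
            raw = ast.literal_eval(raw)
        except (SyntaxError, ValueError) as e:
            raise ValueError(f'Failed to parse pred_rank: {e}')
    if not isinstance(raw, list):
        raise TypeError(f'Expected pred_rank to be a list but got {type(raw)}.')
    return raw
```

For example, parse_pred_rank("[3, 1, 2]") returns [3, 1, 2], while parse_pred_rank("__import__('os')") raises ValueError instead of importing a module.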
@@ -82,8 +81,8 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int =
     if 'pred_rank' not in eval_csv.columns:
         raise ValueError('No `pred_rank` column found in the submitted csv.')
 
-    # …
-    …
+    # Convert DataFrame to dictionary for thread-safe access
+    eval_dict = dict(zip(eval_csv['query_id'], eval_csv['pred_rank']))
 
     # Validate input parameters
     if dataset not in candidate_ids_dict:
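The dict(zip(...)) conversion trades one pass over the DataFrame for O(1) lookups in the worker threads, so workers no longer filter a shared pandas object per query. A self-contained illustration with made-up rows (the real frame comes from the submitted CSV); one caveat is that duplicate query_id rows would silently keep the last occurrence:

```python
import pandas as pd

# Toy stand-in for the submitted CSV.
eval_csv = pd.DataFrame({
    'query_id': [11, 42],
    'pred_rank': ['[3, 1, 2]', '[7, 5]'],
})

# One pass builds the lookup table; each worker then does a dict hit
# instead of a boolean-mask scan over the shared DataFrame.
eval_dict = dict(zip(eval_csv['query_id'], eval_csv['pred_rank']))
assert eval_dict[42] == '[7, 5]'
```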
@@ -100,13 +99,14 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int =
     split_idx = qa_dataset.get_idx_split()
     all_indices = split_idx[split].tolist()
 
-    # Thread-safe containers
+    # Thread-safe containers for results
     results_list = []
-    query_ids = []
     results_lock = Lock()
 
     # Prepare args for each thread
-    args = [(idx, …
+    args = [(idx, eval_dict, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
+
+    failed_queries = []  # Track failed queries
 
     # Process using threads
     with ThreadPoolExecutor(max_workers=num_threads) as executor:
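The submit/collect body between this hunk and the next is unchanged and not shown in the diff, but the surrounding lines suggest the usual submit/as_completed pattern. A runnable sketch of that pattern, with a toy worker standing in for process_single_instance:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def square(args):
    # Toy worker; the real one unpacks (idx, eval_dict, qa_dataset, ...).
    (x,) = args
    return x * x

args = [(i,) for i in range(8)]
results = []
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(square, a) for a in args]
    for future in as_completed(futures):
        # result() re-raises any exception from the worker, which is
        # what the except branch in the next hunk handles.
        results.append(future.result())
print(sorted(results))
```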
@@ -117,19 +117,23 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int =
                 result = future.result()
                 with results_lock:
                     results_list.append(result)
-                    query_ids.append(result['query_id'])
             except Exception as e:
+                query_id = str(e).split('query_id=')[-1].split(':')[0]
+                failed_queries.append(query_id)
                 print(f"Error processing instance: {str(e)}")
 
-    …
-    eval_csv = pd.concat([eval_csv, results_df], ignore_index=True)
+    if failed_queries:
+        print(f"\nFailed to process {len(failed_queries)} queries.")
+        print(f"First few failed query_ids: {failed_queries[:5]}")
 
-    …
+    if not results_list:
+        raise ValueError("No results were successfully processed")
+
+    # Compute final metrics
+    results_df = pd.DataFrame(results_list)
+    final_results = {
+        metric: np.mean(results_df[metric]) for metric in eval_metrics
+    }
 
     return final_results
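Recovering the query id with str(e).split('query_id=')[-1].split(':')[0] is fragile: it only works while every worker exception keeps that exact message shape. One alternative, sketched here with toy data in place of the real args, keeps a future-to-input mapping so failures are attributed without string parsing:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def might_fail(args):
    idx, value = args
    if value < 0:
        raise ValueError(f'bad value for idx={idx}')
    return value * 2

args = [(0, 5), (1, -3), (2, 7)]
results, failed = [], []
with ThreadPoolExecutor(max_workers=2) as executor:
    # Keeping a future -> input mapping lets a failure be attributed
    # to its input directly, with no parsing of str(e).
    future_to_idx = {executor.submit(might_fail, a): a[0] for a in args}
    for future in as_completed(future_to_idx):
        try:
            results.append(future.result())
        except Exception:
            failed.append(future_to_idx[future])
print(sorted(results), sorted(failed))  # [10, 14] [1]
```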
@@ -139,7 +143,6 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int =
         return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
     except Exception as error:
         return f"{error}"
-
 
 
 # Data dictionaries for leaderboard
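The new tail of compute_metrics macro-averages each requested metric over the per-query result rows. A small self-contained illustration with invented metric names and scores (the real rows come from evaluator.evaluate):

```python
import numpy as np
import pandas as pd

# Invented per-query rows; real ones come from evaluator.evaluate.
eval_metrics = ['hit@1', 'mrr']
results_df = pd.DataFrame([
    {'hit@1': 1.0, 'mrr': 1.0, 'idx': 0, 'query_id': 11},
    {'hit@1': 0.0, 'mrr': 0.5, 'idx': 1, 'query_id': 42},
])

# Mean of each metric column, ignoring the bookkeeping columns.
final_results = {metric: np.mean(results_df[metric]) for metric in eval_metrics}
print(final_results['hit@1'], final_results['mrr'])  # 0.5 0.75
```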