Shiyu Zhao committed · commit df3974d · 1 parent: d38a2a4

Update space

Files changed:
- app.py (+144 -78)
- submissions/ance_test_abc/predictions_20241115_001153.csv (deleted)

app.py CHANGED
@@ -36,113 +36,179 @@ except Exception as e:
 36   result_lock = Lock()
 37
 38   def process_single_instance(args):
 39 -
 40 -
 41 -
 42       try:
 43 -
 44 -         if query_id not in eval_dict:
 45 -             raise IndexError(f'Error when processing query_id={query_id}, please make sure the predicted results exist for this query.')
 46
 47 -
 48
 49 -
 50 -
 51           pred_rank = eval(pred_rank)
 52 -
 53 -
 54 -
 55           if not isinstance(pred_rank, list):
 56 -
 57 -
 58 -
 59           answer_ids = torch.LongTensor(answer_ids)
 60 -
 61 -         # Evaluate metrics
 62           result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
 63           result["idx"], result["query_id"] = idx, query_id
 64           return result
 65
 66       except Exception as e:
 67 -
 68 -
 69 - def compute_metrics(csv_path: str, dataset: str, split: str, num_threads: int = 4):
 70 -     candidate_ids_dict = {
 71 -         'amazon': [i for i in range(957192)],
 72 -         'mag': [i for i in range(1172724, 1872968)],
 73 -         'prime': [i for i in range(129375)]
 74 -     }
 75
 76       try:
 77 -
 78           eval_csv = pd.read_csv(csv_path)
 79 -         if 'query_id' not in eval_csv.columns:
 80 -             raise ValueError(
 81 -
 82 -
 83
 84 -         # Convert DataFrame to dictionary for thread-safe access
 85 -         eval_dict = dict(zip(eval_csv['query_id'], eval_csv['pred_rank']))
 86 -
 87 -         # Validate input parameters
 88 -         if dataset not in candidate_ids_dict:
 89 -             raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
 90 -         if split not in ['test', 'test-0.1', 'human_generated_eval']:
 91 -             raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
 92 -
 93 -         # Initialize evaluator and metrics
 94           evaluator = Evaluator(candidate_ids_dict[dataset])
 95           eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
 96 -
 97 -         # Load dataset and get split indices
 98           qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
 99           split_idx = qa_dataset.get_idx_split()
100           all_indices = split_idx[split].tolist()
101 -
102 -
103           results_list = []
104 -
  ⋮       (removed lines 105-129 are not rendered in this view)
130 -
131
132           # Compute final metrics
133           results_df = pd.DataFrame(results_list)
134           final_results = {
135 -             metric:
136           }
137 -
138           return final_results
139
140 -     except pd.errors.EmptyDataError:
141 -         return "Error: The CSV file is empty or could not be read. Please check the file and try again."
142 -     except FileNotFoundError:
143 -         return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
144       except Exception as error:
145 -
146
147
148   # Data dictionaries for leaderboard

 36   result_lock = Lock()
 37
 38   def process_single_instance(args):
 39 +     """Process a single instance with improved prediction handling"""
 40 +     idx, eval_csv, qa_dataset, evaluator, eval_metrics, max_candidate_id = args
 41       try:
 42 +         query, query_id, answer_ids, meta_info = qa_dataset[idx]
 43
 44 +         # Get predictions with better error handling
 45 +         matching_preds = eval_csv[eval_csv['query_id'] == query_id]['pred_rank']
 46 +         if len(matching_preds) == 0:
 47 +             print(f"Warning: No prediction found for query_id {query_id}")
 48 +             return None
 49 +         elif len(matching_preds) > 1:
 50 +             print(f"Warning: Multiple predictions found for query_id {query_id}, using first one")
 51 +             pred_rank = matching_preds.iloc[0]
 52 +         else:
 53 +             pred_rank = matching_preds.iloc[0]
 54
 55 +         # Parse prediction
 56 +         try:
 57 +             if isinstance(pred_rank, str):
 58                   pred_rank = eval(pred_rank)
 59 +             elif isinstance(pred_rank, list):
 60 +                 pass
 61 +             else:
 62 +                 print(f"Warning: Unexpected pred_rank type for query_id {query_id}: {type(pred_rank)}")
 63 +                 return None
 64 +         except Exception as e:
 65 +             print(f"Error parsing pred_rank for query_id {query_id}: {str(e)}")
 66 +             return None
 67 +
 68 +         # Validate and filter predictions
 69           if not isinstance(pred_rank, list):
 70 +             print(f"Warning: pred_rank is not a list for query_id {query_id}")
 71 +             return None
 72 +
 73 +         valid_ranks = [rank for rank in pred_rank if isinstance(rank, (int, np.integer)) and 0 <= rank < max_candidate_id]
 74 +         if len(valid_ranks) == 0:
 75 +             print(f"Warning: No valid predictions for query_id {query_id}")
 76 +             return None
 77 +
 78 +         # Use only valid predictions
 79 +         pred_dict = {valid_ranks[i]: -i for i in range(min(100, len(valid_ranks)))}
 80           answer_ids = torch.LongTensor(answer_ids)
 81           result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
 82 +
 83           result["idx"], result["query_id"] = idx, query_id
 84           return result
 85
 86       except Exception as e:
 87 +         print(f"Error processing idx {idx}: {str(e)}")
 88 +         return None
 89
 90 + def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
 91 +     """Compute metrics with improved prediction handling"""
 92 +     start_time = time.time()
 93       try:
 94 +         print(f"\nStarting compute_metrics for {dataset} {split}")
 95 +
 96 +         # Load CSV and validate format
 97 +         print("Loading and validating CSV file...")
 98           eval_csv = pd.read_csv(csv_path)
 99 +         if 'query_id' not in eval_csv.columns or 'pred_rank' not in eval_csv.columns:
100 +             raise ValueError("CSV must contain 'query_id' and 'pred_rank' columns")
101 +
102 +         # Check for duplicate query_ids
103 +         duplicate_queries = eval_csv['query_id'].duplicated()
104 +         if duplicate_queries.any():
105 +             dup_count = duplicate_queries.sum()
106 +             print(f"Warning: Found {dup_count} duplicate query_ids in CSV")
107 +
108 +         # Keep only necessary columns
109 +         eval_csv = eval_csv[['query_id', 'pred_rank']]
110 +         print(f"CSV loaded, shape: {eval_csv.shape}")
111 +
112 +         # Get dataset-specific candidate size
113 +         candidate_size_dict = {
114 +             'amazon': 957192,
115 +             'mag': 700244,  # 1872968 - 1172724
116 +             'prime': 129375
117 +         }
118 +
119 +         if dataset not in candidate_size_dict:
120 +             raise ValueError(f"Invalid dataset '{dataset}'")
121 +
122 +         max_candidate_id = candidate_size_dict[dataset]
123 +         print(f"Dataset {dataset} has {max_candidate_id} candidates")
124
125           evaluator = Evaluator(candidate_ids_dict[dataset])
126           eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
127 +
128           qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
129           split_idx = qa_dataset.get_idx_split()
130           all_indices = split_idx[split].tolist()
131 +
132 +         print(f"Processing {len(all_indices)} instances...")
133 +
134 +         # Process in batches using ThreadPoolExecutor
135 +         batch_size = 100
136           results_list = []
137 +         progress_queue = Queue()
138 +         valid_results_count = 0
139 +         error_count = 0
140 +
141 +         def process_batch(batch_indices):
142 +             nonlocal valid_results_count, error_count
143 +             batch_results = []
144 +             with ThreadPoolExecutor(max_workers=num_workers) as executor:
145 +                 futures = [
146 +                     executor.submit(process_single_instance,
147 +                                     (idx, eval_csv, qa_dataset, evaluator, eval_metrics, max_candidate_id))
148 +                     for idx in batch_indices
149 +                 ]
150 +                 for future in futures:
151 +                     try:
152 +                         result = future.result()
153 +                         if result is not None:
154 +                             batch_results.append(result)
155 +                             valid_results_count += 1
156 +                         else:
157 +                             error_count += 1
158 +                     except Exception as e:
159 +                         print(f"Error in batch processing: {str(e)}")
160 +                         error_count += 1
161 +                     progress_queue.put(1)
162 +             return batch_results
163 +
164 +         # Process batches with progress tracking
165 +         total_batches = (len(all_indices) + batch_size - 1) // batch_size
166 +         remaining_indices = len(all_indices)
167 +
168 +         def update_progress():
169 +             with tqdm(total=len(all_indices), desc="Processing instances") as pbar:
170 +                 completed = 0
171 +                 while completed < len(all_indices):
172 +                     progress_queue.get()
173 +                     completed += 1
174 +                     pbar.update(1)
175 +
176 +         # Start progress monitoring thread
177 +         progress_thread = threading.Thread(target=update_progress)
178 +         progress_thread.start()
179 +
180 +         # Process batches
181 +         for i in range(0, len(all_indices), batch_size):
182 +             batch_indices = all_indices[i:min(i + batch_size, len(all_indices))]
183 +             batch_results = process_batch(batch_indices)
184 +             results_list.extend(batch_results)
185 +             remaining_indices -= len(batch_indices)
186 +             print(f"\rBatch {i//batch_size + 1}/{total_batches} completed. "
187 +                   f"Valid: {valid_results_count}, Errors: {error_count}, Remaining: {remaining_indices}")
188 +
189 +         progress_thread.join()
190
191           # Compute final metrics
192 +         if not results_list:
193 +             raise ValueError("No valid results were produced")
194 +
195 +         print(f"\nProcessing complete. Valid results: {valid_results_count}, Errors: {error_count}")
196 +
197           results_df = pd.DataFrame(results_list)
198           final_results = {
199 +             metric: results_df[metric].mean()
200 +             for metric in eval_metrics
201           }
202 +
203 +         elapsed_time = time.time() - start_time
204 +         print(f"\nMetrics computation completed in {elapsed_time:.2f} seconds")
205           return final_results
206
207       except Exception as error:
208 +         elapsed_time = time.time() - start_time
209 +         error_msg = f"Error in compute_metrics ({elapsed_time:.2f}s): {str(error)}"
210 +         print(error_msg)
211 +         return error_msg
212
213
214   # Data dictionaries for leaderboard
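For reference, a minimal sketch (not part of the commit) of the submission format the updated compute_metrics expects: a CSV with query_id and pred_rank columns, where pred_rank holds a Python-style list of candidate IDs ordered best first. Per the diff above, a string pred_rank is parsed with eval, non-integer or out-of-range IDs are filtered out, and only the first 100 valid IDs are scored. The file name, query IDs, and candidate IDs below are made up for illustration, and the commented-out call assumes app.py and its dependencies (Evaluator, load_qa, the QA data) can be imported locally.

import pandas as pd

rows = [
    # query_id from the evaluation split; pred_rank = candidate IDs ranked best-first, stored as a string
    {"query_id": 0, "pred_rank": str([101, 57, 923, 4018, 12])},
    {"query_id": 1, "pred_rank": str([7, 7700, 31, 2, 956000])},
]
pd.DataFrame(rows).to_csv("my_predictions.csv", index=False)

# If app.py is importable in a local environment with its dependencies installed:
# from app import compute_metrics
# print(compute_metrics("my_predictions.csv", dataset="amazon", split="test"))
# Expected result: a dict with the mean hit@1, hit@5, recall@20 and mrr over the split.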
submissions/ance_test_abc/predictions_20241115_001153.csv DELETED
The diff for this file is too large to render. See raw diff.
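The deleted file follows the submissions/<submission_name>/predictions_<timestamp>.csv layout visible in its path. Below is a small sketch of writing a predictions file into that layout; the helper name and the timestamp format are assumptions inferred from the deleted path, not taken from the app code.

import os
from datetime import datetime
import pandas as pd

def save_submission(df: pd.DataFrame, submission_name: str, root: str = "submissions") -> str:
    # df is expected to carry the 'query_id' and 'pred_rank' columns checked by compute_metrics
    out_dir = os.path.join(root, submission_name)
    os.makedirs(out_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # e.g. 20241115_001153
    out_path = os.path.join(out_dir, f"predictions_{timestamp}.csv")
    df.to_csv(out_path, index=False)
    return out_path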