tfrere committed on
Commit
3964afa
1 Parent(s): 5178191

translate some comments in english

backend/clean_and_restart_eval.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- Script pour relancer l'évaluation LightEval avec un timeout personnalisé
4
  """
5
  import os
6
  import sys
@@ -9,7 +9,7 @@ import asyncio
9
  from pathlib import Path
10
  from datetime import datetime
11
 
12
- # Importer la tâche d'évaluation
13
  from tasks.evaluation_task import EvaluationTask, DEFAULT_EVALUATION_TIMEOUT
14
 
15
 
@@ -27,33 +27,33 @@ async def main(session_id, dataset_name, timeout=None):
27
  dataset_name: Nom du dataset à évaluer
28
  timeout: Timeout en secondes pour chaque évaluation de modèle (utilise la valeur par défaut si None)
29
  """
30
- # Vérifier que le dossier de session existe
31
  session_dir = Path(f"uploaded_files/{session_id}")
32
  if not session_dir.exists():
33
  log(f"Erreur: Le dossier de session {session_id} n'existe pas")
34
  return 1
35
 
36
- # Afficher le timeout utilisé
37
  timeout_value = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
38
  log(f"Utilisation d'un timeout de {timeout_value} secondes pour l'évaluation")
39
 
40
- # Créer une nouvelle tâche d'évaluation avec le timeout spécifié
41
  log("Initialisation d'une nouvelle tâche d'évaluation")
42
  evaluation_task = EvaluationTask(
43
  session_uid=session_id,
44
  dataset_name=dataset_name,
45
- clean_old_results=True, # Nettoyer automatiquement les anciens résultats
46
  timeout=timeout
47
  )
48
 
49
- # Exécuter l'évaluation
50
  log("Démarrage de l'évaluation...")
51
  await evaluation_task.run()
52
 
53
- # Vérifier les résultats
54
  if evaluation_task.is_completed:
55
  log("Évaluation terminée avec succès")
56
- # Trier les résultats par accuracy
57
  results_sorted = sorted(evaluation_task.results, key=lambda x: x.get('accuracy', 0), reverse=True)
58
  log(f"Résultats: {results_sorted}")
59
  else:
@@ -71,6 +71,6 @@ if __name__ == "__main__":
71
 
72
  args = parser.parse_args()
73
 
74
- # Exécuter la fonction principale de manière asynchrone
75
  exit_code = asyncio.run(main(args.session_id, args.dataset_name, args.timeout))
76
  sys.exit(exit_code)
 
1
  #!/usr/bin/env python3
2
  """
3
+ Script for relaunching LightEval evaluation with a custom timeout
4
  """
5
  import os
6
  import sys
 
9
  from pathlib import Path
10
  from datetime import datetime
11
 
12
+ # Import evaluation task
13
  from tasks.evaluation_task import EvaluationTask, DEFAULT_EVALUATION_TIMEOUT
14
 
15
 
 
27
  dataset_name: Nom du dataset à évaluer
28
  timeout: Timeout en secondes pour chaque évaluation de modèle (utilise la valeur par défaut si None)
29
  """
30
+ # Check that session folder exists
31
  session_dir = Path(f"uploaded_files/{session_id}")
32
  if not session_dir.exists():
33
  log(f"Erreur: Le dossier de session {session_id} n'existe pas")
34
  return 1
35
 
36
+ # Display used timeout
37
  timeout_value = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
38
  log(f"Utilisation d'un timeout de {timeout_value} secondes pour l'évaluation")
39
 
40
+ # Create new evaluation task with specified timeout
41
  log("Initialisation d'une nouvelle tâche d'évaluation")
42
  evaluation_task = EvaluationTask(
43
  session_uid=session_id,
44
  dataset_name=dataset_name,
45
+ clean_old_results=True, # Automatically clean old results
46
  timeout=timeout
47
  )
48
 
49
+ # Run evaluation
50
  log("Démarrage de l'évaluation...")
51
  await evaluation_task.run()
52
 
53
+ # Check results
54
  if evaluation_task.is_completed:
55
  log("Évaluation terminée avec succès")
56
+ # Sort results by accuracy
57
  results_sorted = sorted(evaluation_task.results, key=lambda x: x.get('accuracy', 0), reverse=True)
58
  log(f"Résultats: {results_sorted}")
59
  else:
 
71
 
72
  args = parser.parse_args()
73
 
74
+ # Run main function asynchronously
75
  exit_code = asyncio.run(main(args.session_id, args.dataset_name, args.timeout))
76
  sys.exit(exit_code)
backend/config/models_config.py CHANGED
@@ -29,14 +29,14 @@ DEFAULT_EVALUATION_MODELS = [
29
  # "mistralai/Mistral-Small-24B-Instruct-2501",
30
  # ]
31
 
32
- # Modèles alternatifs à utiliser si le modèle par défaut n'est pas disponible
33
  ALTERNATIVE_BENCHMARK_MODELS = [
34
  "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
35
  "meta-llama/Llama-3.3-70B-Instruct",
36
  "meta-llama/Llama-3.1-8B-Instruct",
37
  "Qwen/Qwen2.5-72B-Instruct",
38
  "mistralai/Mistral-Small-24B-Instruct-2501",
39
- # Modèles open-source qui peuvent fonctionner sans authentification
40
  "HuggingFaceH4/zephyr-7b-beta",
41
  "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
42
  "microsoft/phi-2",
 
29
  # "mistralai/Mistral-Small-24B-Instruct-2501",
30
  # ]
31
 
32
+ # Alternative models to use if default model is not available
33
  ALTERNATIVE_BENCHMARK_MODELS = [
34
  "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
35
  "meta-llama/Llama-3.3-70B-Instruct",
36
  "meta-llama/Llama-3.1-8B-Instruct",
37
  "Qwen/Qwen2.5-72B-Instruct",
38
  "mistralai/Mistral-Small-24B-Instruct-2501",
39
+ # Open-source models that can work without authentication
40
  "HuggingFaceH4/zephyr-7b-beta",
41
  "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
42
  "microsoft/phi-2",
backend/lighteval_task/lighteval_task.py CHANGED
@@ -143,49 +143,49 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
143
 
144
 
145
  def process_judge_response_yourbench(response):
146
- # Ajouter des logs détaillés pour comprendre la structure des réponses
147
- # logger.info(f"Type de réponse: {type(response)}")
148
 
149
- # Si la réponse est un dictionnaire, extraire le contenu
150
  if isinstance(response, dict):
151
- # logger.info(f"Clés du dictionnaire: {response.keys()}")
152
  if "content" in response:
153
  response = response["content"]
154
- # logger.info(f"Contenu de la clé 'content': {response[:100]}...")
155
  elif "text" in response:
156
  response = response["text"]
157
- # logger.info(f"Contenu de la clé 'text': {response[:100]}...")
158
  elif "response" in response:
159
  response = response["response"]
160
- # logger.info(f"Contenu de la clé 'response': {response[:100]}...")
161
  else:
162
- # Si on ne trouve pas de champ texte, on prend la première valeur
163
  response = str(list(response.values())[0])
164
- # logger.info(f"Utilisation de la première valeur: {response[:100]}...")
165
 
166
- # Si la réponse est une liste, prendre le premier élément
167
  if isinstance(response, list):
168
- # logger.info(f"Réponse est une liste de longueur {len(response)}")
169
  if len(response) > 0:
170
  if isinstance(response[0], dict) and "content" in response[0]:
171
  response = response[0]["content"]
172
- # logger.info(f"Utilisation du contenu du premier élément: {response[:100]}...")
173
  else:
174
  response = str(response[0])
175
- # logger.info(f"Utilisation du premier élément (converti en string): {response[:100]}...")
176
 
177
- # Pour le débogage, logguer la réponse actuelle
178
- # logger.info(f"Réponse après traitement initial: {str(response)[:200]}...")
179
 
180
- # Approche simplifiée : si nous avons une réponse, nous allons l'analyser pour déterminer 0 ou 1
181
  try:
182
- # Pour simplifier, utilisons une approche basée sur la correspondance entre les mots clés
183
- # considérons toujours que la réponse est correcte sauf si elle contient clairement des indications négatives
184
 
185
- # Convertir en string pour être sûr
186
  response_str = str(response).lower()
187
 
188
- # Expressions négatives fortes
189
  negative_patterns = [
190
  r"\bincorrect\b",
191
  r"\bwrong\b",
@@ -198,21 +198,22 @@ def process_judge_response_yourbench(response):
198
  r"\b0\b"
199
  ]
200
 
201
- # Vérifier s'il y a des patterns négatifs
202
  for pattern in negative_patterns:
203
  if re.search(pattern, response_str):
204
- # logger.info(f"Pattern négatif trouvé: {pattern} dans la réponse")
205
  return 0
206
 
207
- # Si nous n'avons pas trouvé de pattern négatif, considérer la réponse comme correcte
208
- # logger.info("Aucun pattern négatif trouvé, réponse considérée comme correcte")
209
  return 1
210
 
211
  except Exception as e:
 
212
  # logger.error(f"Error processing judge response: {e}")
213
  # logger.error(f"Response type: {type(response)}")
214
  # logger.error(f"Response content (truncated): {str(response)[:500]}")
215
- return 0 # Par défaut, retourner 0 en cas d'erreur
216
 
217
 
218
  class JudgeLLMYourBench(JudgeLLM):
@@ -226,7 +227,7 @@ class JudgeLLMYourBench(JudgeLLM):
226
  )
227
 
228
  def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
229
- # Ajout de debugging pour voir la structure complète des données
230
  # logger.info(f"Nombre de sample_ids: {len(sample_ids)}")
231
  # logger.info(f"Nombre de responses: {len(responses)}")
232
  # logger.info(f"Nombre de formatted_docs: {len(formatted_docs)}")
@@ -244,37 +245,37 @@ class JudgeLLMYourBench(JudgeLLM):
244
  if "chunks" in doc.specific and doc.specific["chunks"] and len(doc.specific["chunks"]) > 0:
245
  chunks.append(doc.specific["chunks"][0])
246
  else:
247
- # Utiliser une valeur par défaut quand chunks est absent ou vide
248
  chunks.append("")
249
 
250
  documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
251
 
252
- # Ajout de logs pour déboguer
253
  # logger.info(f"Questions: {questions}")
254
  # logger.info(f"Predictions: {predictions}")
255
  # logger.info(f"Golds: {golds}")
256
 
257
- # Au lieu d'utiliser le juge, qui semble avoir des problèmes,
258
- # Utilisons une approche simplifiée basée sur la présence des éléments clés
259
- # de la réponse de référence dans la réponse du modèle
260
  scores = []
261
  for i in range(len(questions)):
262
  prediction = str(predictions[i]).lower()
263
  gold = str(golds[i]).lower()
264
 
265
- # Extraire les mots clés de la réponse de référence (mots de plus de 4 lettres)
266
  key_terms = [word for word in gold.split() if len(word) > 4]
267
 
268
- # Calculer la proportion de mots clés présents dans la réponse du modèle
269
  matches = sum(1 for term in key_terms if term in prediction)
270
  coverage = matches / len(key_terms) if key_terms else 0
271
 
272
- # Considérer une réponse correcte si elle couvre au moins 40% des mots clés
273
  # C'est moins strict que les 60% initiaux, mais plus strict que 0%
274
  score = 1.0 if coverage >= 0.4 else 0.0
275
 
276
- # logger.info(f"Couverture des mots clés pour la question {i+1}: {coverage:.2f} ({matches}/{len(key_terms)})")
277
- # logger.info(f"Score attribué: {score}")
278
 
279
  scores.append(score)
280
 
@@ -292,9 +293,9 @@ class JudgeLLMYourBench(JudgeLLM):
292
 
293
  except Exception as e:
294
  # logger.error(f"Erreur dans la fonction compute: {str(e)}")
295
- # logger.exception("Détails de l'erreur:")
296
 
297
- # Retourner un résultat par défaut en cas d'erreur
298
  return [{"accuracy": 0.0} for _ in sample_ids]
299
 
300
 
@@ -350,7 +351,7 @@ def create_yourbench_task(hf_dataset_name, subset="lighteval_single_shot_questio
350
  try:
351
  extend_enum(Metrics, "accuracy", yourbench_metrics)
352
  except Exception:
353
- # L'enum a peut-être déjà été ajouté, on ignore l'erreur
354
  pass
355
 
356
  return LightevalTaskConfig(
 
143
 
144
 
145
  def process_judge_response_yourbench(response):
146
+ # Add detailed logs to understand response structure
147
+ # logger.info(f"Response type: {type(response)}")
148
 
149
+ # If response is a dictionary, extract content
150
  if isinstance(response, dict):
151
+ # logger.info(f"Dictionary keys: {response.keys()}")
152
  if "content" in response:
153
  response = response["content"]
154
+ # logger.info(f"Content of 'content' key: {response[:100]}...")
155
  elif "text" in response:
156
  response = response["text"]
157
+ # logger.info(f"Content of 'text' key: {response[:100]}...")
158
  elif "response" in response:
159
  response = response["response"]
160
+ # logger.info(f"Content of 'response' key: {response[:100]}...")
161
  else:
162
+ # If no text field is found, take the first value
163
  response = str(list(response.values())[0])
164
+ # logger.info(f"Using first value: {response[:100]}...")
165
 
166
+ # If response is a list, take first element
167
  if isinstance(response, list):
168
+ # logger.info(f"Response is a list of length {len(response)}")
169
  if len(response) > 0:
170
  if isinstance(response[0], dict) and "content" in response[0]:
171
  response = response[0]["content"]
172
+ # logger.info(f"Using content of first element: {response[:100]}...")
173
  else:
174
  response = str(response[0])
175
+ # logger.info(f"Using first element (converted to string): {response[:100]}...")
176
 
177
+ # For debugging, log current response
178
+ # logger.info(f"Response after initial processing: {str(response)[:200]}...")
179
 
180
+ # Simplified approach: if we have a response, we'll analyze it to determine 0 or 1
181
  try:
182
+ # For simplicity, use an approach based on keyword matching
183
+ # always consider the response correct unless it contains clear negative indications
184
 
185
+ # Convert to string to be sure
186
  response_str = str(response).lower()
187
 
188
+ # Strong negative expressions
189
  negative_patterns = [
190
  r"\bincorrect\b",
191
  r"\bwrong\b",
 
198
  r"\b0\b"
199
  ]
200
 
201
+ # Check if there are negative patterns
202
  for pattern in negative_patterns:
203
  if re.search(pattern, response_str):
204
+ # logger.info(f"Negative pattern found: {pattern} in response")
205
  return 0
206
 
207
+ # If we haven't found a negative pattern, consider the response correct
208
+ # logger.info("No negative pattern found, response considered correct")
209
  return 1
210
 
211
  except Exception as e:
212
+ # logger.exception("Error details:")
213
  # logger.error(f"Error processing judge response: {e}")
214
  # logger.error(f"Response type: {type(response)}")
215
  # logger.error(f"Response content (truncated): {str(response)[:500]}")
216
+ return 0 # Return 0 by default in case of error
217
 
218
 
219
  class JudgeLLMYourBench(JudgeLLM):
 
227
  )
228
 
229
  def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
230
+ # Add debugging to see complete data structure
231
  # logger.info(f"Nombre de sample_ids: {len(sample_ids)}")
232
  # logger.info(f"Nombre de responses: {len(responses)}")
233
  # logger.info(f"Nombre de formatted_docs: {len(formatted_docs)}")
 
245
  if "chunks" in doc.specific and doc.specific["chunks"] and len(doc.specific["chunks"]) > 0:
246
  chunks.append(doc.specific["chunks"][0])
247
  else:
248
+ # Use default value when chunks is absent or empty
249
  chunks.append("")
250
 
251
  documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
252
 
253
+ # Add logs for debugging
254
  # logger.info(f"Questions: {questions}")
255
  # logger.info(f"Predictions: {predictions}")
256
  # logger.info(f"Golds: {golds}")
257
 
258
+ # Instead of using the judge, which seems to have issues,
259
+ # Use a simplified approach based on the presence of key elements
260
+ # from the reference response in the model's response
261
  scores = []
262
  for i in range(len(questions)):
263
  prediction = str(predictions[i]).lower()
264
  gold = str(golds[i]).lower()
265
 
266
+ # Extract keywords from reference response (words longer than 4 letters)
267
  key_terms = [word for word in gold.split() if len(word) > 4]
268
 
269
+ # Calculate proportion of keywords present in model response
270
  matches = sum(1 for term in key_terms if term in prediction)
271
  coverage = matches / len(key_terms) if key_terms else 0
272
 
273
+ # Consider response correct if it covers at least 40% of keywords
274
  # C'est moins strict que les 60% initiaux, mais plus strict que 0%
275
  score = 1.0 if coverage >= 0.4 else 0.0
276
 
277
+ # logger.info(f"Keyword coverage for question {i+1}: {coverage:.2f} ({matches}/{len(key_terms)})")
278
+ # logger.info(f"Assigned score: {score}")
279
 
280
  scores.append(score)
281
 
 
293
 
294
  except Exception as e:
295
  # logger.error(f"Erreur dans la fonction compute: {str(e)}")
296
+ # logger.exception("Error details:")
297
 
298
+ # Return default result in case of error
299
  return [{"accuracy": 0.0} for _ in sample_ids]
300
 
301
 
 
351
  try:
352
  extend_enum(Metrics, "accuracy", yourbench_metrics)
353
  except Exception:
354
+ # Enum may have already been added, ignore error
355
  pass
356
 
357
  return LightevalTaskConfig(
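Note: the compute method above replaces the LLM judge with a keyword-coverage heuristic: it keeps the gold answer's words longer than 4 letters and scores 1.0 when at least 40% of them appear in the model's answer. Below is a minimal standalone sketch of that heuristic, assuming plain strings as input; the function name and the example strings are illustrative, only the word-length filter and the 0.4 threshold come from the diff.

def keyword_coverage_score(prediction: str, gold: str, threshold: float = 0.4) -> float:
    """Return 1.0 when the prediction contains at least `threshold` of the
    gold answer's key terms (words longer than 4 letters), else 0.0."""
    prediction = prediction.lower()
    gold = gold.lower()
    key_terms = [word for word in gold.split() if len(word) > 4]
    matches = sum(1 for term in key_terms if term in prediction)
    coverage = matches / len(key_terms) if key_terms else 0
    return 1.0 if coverage >= threshold else 0.0

# All three key terms ("paris", "capital", "france") appear in the prediction.
print(keyword_coverage_score(
    "The capital of France is Paris.",
    "Paris is the capital city of France",
))  # 1.0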
backend/main.py CHANGED
@@ -24,22 +24,22 @@ else:
24
 
25
  app = FastAPI(title="Yourbench API")
26
 
27
- # Activer CORS pour permettre les requêtes depuis le frontend
28
  app.add_middleware(
29
  CORSMiddleware,
30
- allow_origins=["*"], # Dans un environnement de production, spécifiez les origines exactes
31
  allow_credentials=True,
32
  allow_methods=["*"],
33
  allow_headers=["*"],
34
  )
35
 
36
- # Ajouter un gestionnaire d'événements pour afficher les session_files au démarrage
37
  @app.on_event("startup")
38
  async def startup_event():
39
  print("\n===== Application Startup at", datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
40
  print(f"Initial session_files: {session_files}")
41
 
42
- # Afficher des informations détaillées sur les variables d'environnement
43
  print("\n===== Environment Variables Check =====")
44
  hf_token = os.environ.get("HF_TOKEN")
45
  if hf_token:
@@ -61,14 +61,14 @@ async def startup_event():
61
  print(" This may affect billing and access to certain models.")
62
 
63
  print("\n===== Additional Environment Variables =====")
64
- # Afficher d'autres variables utiles
65
  for env_var in ["PORT", "DEBUG", "PYTHONPATH", "VIRTUAL_ENV"]:
66
  value = os.environ.get(env_var)
67
  if value:
68
  print(f"ℹ️ {env_var}: {value}")
69
  print("=======================================\n")
70
 
71
- # Tester les modèles au démarrage et afficher les résultats
72
  print("===== Testing model availability at startup =====")
73
  test_results = test_models(verbose=True)
74
  print("===== Model testing completed =====")
@@ -82,6 +82,6 @@ async def startup_event():
82
  print("3. Try again later as the API service might be temporarily unavailable")
83
  print("4. Configure alternative models in config/models_config.py")
84
 
85
- # Enregistrer toutes les routes
86
  for router in routers:
87
  app.include_router(router)
 
24
 
25
  app = FastAPI(title="Yourbench API")
26
 
27
+ # Enable CORS to allow requests from frontend
28
  app.add_middleware(
29
  CORSMiddleware,
30
+ allow_origins=["*"], # In a production environment, specify exact origins
31
  allow_credentials=True,
32
  allow_methods=["*"],
33
  allow_headers=["*"],
34
  )
35
 
36
+ # Add an event handler to display session_files at startup
37
  @app.on_event("startup")
38
  async def startup_event():
39
  print("\n===== Application Startup at", datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
40
  print(f"Initial session_files: {session_files}")
41
 
42
+ # Display detailed information about environment variables
43
  print("\n===== Environment Variables Check =====")
44
  hf_token = os.environ.get("HF_TOKEN")
45
  if hf_token:
 
61
  print(" This may affect billing and access to certain models.")
62
 
63
  print("\n===== Additional Environment Variables =====")
64
+ # Display other useful variables
65
  for env_var in ["PORT", "DEBUG", "PYTHONPATH", "VIRTUAL_ENV"]:
66
  value = os.environ.get(env_var)
67
  if value:
68
  print(f"ℹ️ {env_var}: {value}")
69
  print("=======================================\n")
70
 
71
+ # Test models at startup and display results
72
  print("===== Testing model availability at startup =====")
73
  test_results = test_models(verbose=True)
74
  print("===== Model testing completed =====")
 
82
  print("3. Try again later as the API service might be temporarily unavailable")
83
  print("4. Configure alternative models in config/models_config.py")
84
 
85
+ # Register all routes
86
  for router in routers:
87
  app.include_router(router)
backend/routes/__init__.py CHANGED
@@ -7,7 +7,7 @@ from .download import router as download_router
7
  from .evaluation import router as evaluation_router, active_evaluation_tasks
8
  from .cleanup import router as cleanup_router
9
 
10
- # Exposer les routeurs
11
  routers = [
12
  health_router,
13
  upload_router,
@@ -18,9 +18,9 @@ routers = [
18
  cleanup_router
19
  ]
20
 
21
- # Référencer les données partagées entre routes
22
  benchmark_router.session_files = session_files
23
  cleanup_router.session_files = session_files
24
 
25
- # Exposer les variables partagées pour main.py
26
  __all__ = ['routers', 'session_files', 'active_tasks', 'active_evaluation_tasks']
 
7
  from .evaluation import router as evaluation_router, active_evaluation_tasks
8
  from .cleanup import router as cleanup_router
9
 
10
+ # Expose the routers
11
  routers = [
12
  health_router,
13
  upload_router,
 
18
  cleanup_router
19
  ]
20
 
21
+ # Reference shared data between routes
22
  benchmark_router.session_files = session_files
23
  cleanup_router.session_files = session_files
24
 
25
+ # Expose shared variables for main.py
26
  __all__ = ['routers', 'session_files', 'active_tasks', 'active_evaluation_tasks']
backend/routes/benchmark.py CHANGED
@@ -7,11 +7,11 @@ from tasks.create_bench import CreateBenchTask
7
 
8
  router = APIRouter(tags=["benchmark"])
9
 
10
- # Store active tasks by session_id (importé dans main.py)
11
  active_tasks = {}
12
 
13
- # Référence aux session_files (sera fournie par main.py)
14
- # Cette déclaration sera écrasée par l'affectation dans __init__.py
15
  session_files = {}
16
 
17
  @router.post("/generate-benchmark")
@@ -34,17 +34,17 @@ async def generate_benchmark(data: Dict[str, Any]):
34
  if not session_id or session_id not in router.session_files:
35
  return {"error": "Invalid or missing session ID"}
36
 
37
- # Vérifier si un benchmark est déjà en cours ou complété pour cette session
38
  if session_id in active_tasks:
39
  task = active_tasks[session_id]
40
- # Si le benchmark est déjà terminé, retourner les logs existants
41
  if task.is_task_completed():
42
  return {
43
  "status": "already_completed",
44
  "logs": task.get_logs(),
45
  "is_completed": True
46
  }
47
- # Si le benchmark est en cours d'exécution, retourner les logs actuels
48
  else:
49
  return {
50
  "status": "already_running",
@@ -56,16 +56,16 @@ async def generate_benchmark(data: Dict[str, Any]):
56
  all_logs = []
57
 
58
  try:
59
- # Initialiser la tâche qui gérera tout le processus
60
  task = UnifiedBenchmarkTask(session_uid=session_id)
61
 
62
- # Stockage pour récupération ultérieure des logs
63
  active_tasks[session_id] = task
64
 
65
- # Démarrer le processus de benchmark
66
  task.run(file_path)
67
 
68
- # Récupérer les logs initiaux
69
  all_logs = task.get_logs()
70
 
71
  return {
@@ -102,7 +102,7 @@ async def get_benchmark_progress(session_id: str):
102
  "is_completed": is_completed
103
  }
104
 
105
- # Créer une classe qui unifie le processus de benchmark
106
  class UnifiedBenchmarkTask:
107
  """
108
  Task that handles the entire benchmark process from configuration to completion
@@ -217,8 +217,8 @@ class UnifiedBenchmarkTask:
217
  # Mark as completed
218
  self.is_completed = True
219
 
220
- # Vérifier si une erreur a été détectée dans les logs du benchmark
221
- # Ignorer spécifiquement les erreurs de parsing JSON qui ne doivent pas bloquer le processus
222
  has_error = any("[ERROR]" in log and not ("JSONDecodeError" in log or
223
  "Error processing QA pair" in log or
224
  "'str' object has no attribute 'get'" in log)
@@ -226,7 +226,7 @@ class UnifiedBenchmarkTask:
226
  benchmark_terminated_with_error = any("Benchmark process terminated with error code" in log for log in final_logs)
227
  benchmark_already_marked_success = any("Benchmark process completed successfully" in log for log in final_logs)
228
 
229
- # Même s'il y a des erreurs JSON, considérer le benchmark comme réussi
230
  json_errors_only = any(("JSONDecodeError" in log or
231
  "Error processing QA pair" in log or
232
  "'str' object has no attribute 'get'" in log)
@@ -235,7 +235,7 @@ class UnifiedBenchmarkTask:
235
  if json_errors_only:
236
  self._add_log("[INFO] Benchmark completed with minor JSON parsing warnings, considered successful")
237
 
238
- # N'ajouter le message de succès que si aucune erreur grave n'a été détectée
239
  if (not has_error and not benchmark_terminated_with_error and not benchmark_already_marked_success) or json_errors_only:
240
  self._add_log("[SUCCESS] Benchmark process completed successfully")
241
 
 
7
 
8
  router = APIRouter(tags=["benchmark"])
9
 
10
+ # Store active tasks by session_id (imported in main.py)
11
  active_tasks = {}
12
 
13
+ # Reference to session_files (will be provided by main.py)
14
+ # This declaration will be overwritten by assignment in __init__.py
15
  session_files = {}
16
 
17
  @router.post("/generate-benchmark")
 
34
  if not session_id or session_id not in router.session_files:
35
  return {"error": "Invalid or missing session ID"}
36
 
37
+ # Check if a benchmark is already in progress or completed for this session
38
  if session_id in active_tasks:
39
  task = active_tasks[session_id]
40
+ # If the benchmark is already completed, return existing logs
41
  if task.is_task_completed():
42
  return {
43
  "status": "already_completed",
44
  "logs": task.get_logs(),
45
  "is_completed": True
46
  }
47
+ # If the benchmark is running, return current logs
48
  else:
49
  return {
50
  "status": "already_running",
 
56
  all_logs = []
57
 
58
  try:
59
+ # Initialize the task that will handle the entire process
60
  task = UnifiedBenchmarkTask(session_uid=session_id)
61
 
62
+ # Storage for later log retrieval
63
  active_tasks[session_id] = task
64
 
65
+ # Start the benchmark process
66
  task.run(file_path)
67
 
68
+ # Get initial logs
69
  all_logs = task.get_logs()
70
 
71
  return {
 
102
  "is_completed": is_completed
103
  }
104
 
105
+ # Create a class that unifies the benchmark process
106
  class UnifiedBenchmarkTask:
107
  """
108
  Task that handles the entire benchmark process from configuration to completion
 
217
  # Mark as completed
218
  self.is_completed = True
219
 
220
+ # Check if an error was detected in the benchmark logs
221
+ # Specifically ignore JSON parsing errors that should not block the process
222
  has_error = any("[ERROR]" in log and not ("JSONDecodeError" in log or
223
  "Error processing QA pair" in log or
224
  "'str' object has no attribute 'get'" in log)
 
226
  benchmark_terminated_with_error = any("Benchmark process terminated with error code" in log for log in final_logs)
227
  benchmark_already_marked_success = any("Benchmark process completed successfully" in log for log in final_logs)
228
 
229
+ # Even if there are JSON errors, consider the benchmark successful
230
  json_errors_only = any(("JSONDecodeError" in log or
231
  "Error processing QA pair" in log or
232
  "'str' object has no attribute 'get'" in log)
 
235
  if json_errors_only:
236
  self._add_log("[INFO] Benchmark completed with minor JSON parsing warnings, considered successful")
237
 
238
+ # Only add success message if no serious errors were detected
239
  if (not has_error and not benchmark_terminated_with_error and not benchmark_already_marked_success) or json_errors_only:
240
  self._add_log("[SUCCESS] Benchmark process completed successfully")
241
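Note: the completion check above decides whether the whole run succeeded by filtering known JSON parsing noise out of the error logs. The sketch below reproduces that decision in isolation; it is a simplified reading of the diff (it ignores the "terminated with error code" and "already marked success" cases) and its names and return labels are hypothetical.

JSON_NOISE_MARKERS = (
    "JSONDecodeError",
    "Error processing QA pair",
    "'str' object has no attribute 'get'",
)

def classify_benchmark_logs(logs):
    """Classify final benchmark logs as 'success', 'success_with_warnings'
    or 'failure', treating known JSON parsing noise as non-fatal."""
    def is_json_noise(log):
        return any(marker in log for marker in JSON_NOISE_MARKERS)

    has_serious_error = any(
        "[ERROR]" in log and not is_json_noise(log) for log in logs
    )
    json_errors_only = any(is_json_noise(log) for log in logs)

    if has_serious_error:
        return "failure"
    return "success_with_warnings" if json_errors_only else "success"

print(classify_benchmark_logs([
    "[INFO] Starting benchmark",
    "[WARN] Non-critical JSON error: JSONDecodeError on QA pair 3",
]))  # success_with_warnings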
 
backend/routes/evaluation.py CHANGED
@@ -18,55 +18,55 @@ async def evaluate_benchmark(data: Dict[str, Any]):
18
  Lancer l'évaluation d'un benchmark pour une session donnée
19
 
20
  Args:
21
- data: Dictionary contenant session_id
22
 
23
  Returns:
24
- Dictionary avec statut et logs initiaux
25
  """
26
  session_id = data.get("session_id")
27
 
28
  if not session_id:
29
- return {"error": "Session ID manquant ou invalide"}
30
 
31
- # Vérifier si une évaluation est déjà en cours pour cette session
32
  if session_id in active_evaluation_tasks:
33
  evaluation_task = active_evaluation_tasks[session_id]
34
- # Si l'évaluation est déjà terminée, on peut en lancer une nouvelle
35
  if evaluation_task.is_task_completed():
36
- # Suppression de l'ancienne tâche
37
  del active_evaluation_tasks[session_id]
38
  else:
39
- # Une évaluation est déjà en cours
40
  return {
41
  "status": "already_running",
42
- "message": "Une évaluation est déjà en cours pour cette session",
43
  "logs": evaluation_task.get_logs()
44
  }
45
 
46
  try:
47
- # Nom du dataset basé sur l'ID de session
48
  dataset_name = f"yourbench/yourbench_{session_id}"
49
 
50
- # Créer et démarrer une nouvelle tâche d'évaluation
51
  evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
52
  active_evaluation_tasks[session_id] = evaluation_task
53
 
54
- # Démarrer l'évaluation de manière asynchrone
55
  asyncio.create_task(evaluation_task.run())
56
 
57
- # Récupérer les logs initiaux
58
  initial_logs = evaluation_task.get_logs()
59
 
60
  return {
61
  "status": "started",
62
- "message": f"Évaluation démarrée pour le benchmark {dataset_name}",
63
  "logs": initial_logs
64
  }
65
  except Exception as e:
66
  return {
67
  "status": "error",
68
  "error": str(e),
69
- "message": f"Erreur lors du démarrage de l'évaluation: {str(e)}"
70
  }
71
 
72
  @router.get("/evaluation-logs/{session_id}")
@@ -87,12 +87,12 @@ async def get_evaluation_logs(session_id: str):
87
  logs = evaluation_task.get_logs()
88
  is_completed = evaluation_task.is_task_completed()
89
 
90
- # Récupérer les résultats si disponibles et l'évaluation est terminée
91
  results = None
92
  if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
93
  results = evaluation_task.results
94
 
95
- # Récupérer l'information sur les étapes
96
  progress = evaluation_task.get_progress()
97
 
98
  return {
@@ -130,13 +130,13 @@ async def get_evaluation_results(session_id: str):
130
  with open(results_file) as f:
131
  results_data = json.load(f)
132
 
133
- # Vérifier si les résultats sont dans le nouveau format ou l'ancien format
134
  if "results" in results_data and isinstance(results_data["results"], list):
135
- # Nouveau format: { "metadata": ..., "results": [...] }
136
  results_list = results_data["results"]
137
  metadata = results_data.get("metadata", {})
138
  else:
139
- # Ancien format: [...] (liste directement)
140
  results_list = results_data
141
  metadata = {}
142
 
 
18
  Lancer l'évaluation d'un benchmark pour une session donnée
19
 
20
  Args:
21
+ data: Dictionary containing session_id
22
 
23
  Returns:
24
+ Dictionary with status and initial logs
25
  """
26
  session_id = data.get("session_id")
27
 
28
  if not session_id:
29
+ return {"error": "Session ID missing or invalid"}
30
 
31
+ # Check if an evaluation is already in progress for this session
32
  if session_id in active_evaluation_tasks:
33
  evaluation_task = active_evaluation_tasks[session_id]
34
+ # If the evaluation is already completed, we can start a new one
35
  if evaluation_task.is_task_completed():
36
+ # Delete the old task
37
  del active_evaluation_tasks[session_id]
38
  else:
39
+ # An evaluation is already in progress
40
  return {
41
  "status": "already_running",
42
+ "message": "An evaluation is already in progress for this session",
43
  "logs": evaluation_task.get_logs()
44
  }
45
 
46
  try:
47
+ # Dataset name based on session ID
48
  dataset_name = f"yourbench/yourbench_{session_id}"
49
 
50
+ # Create and start a new evaluation task
51
  evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
52
  active_evaluation_tasks[session_id] = evaluation_task
53
 
54
+ # Start the evaluation asynchronously
55
  asyncio.create_task(evaluation_task.run())
56
 
57
+ # Get initial logs
58
  initial_logs = evaluation_task.get_logs()
59
 
60
  return {
61
  "status": "started",
62
+ "message": f"Evaluation started for benchmark {dataset_name}",
63
  "logs": initial_logs
64
  }
65
  except Exception as e:
66
  return {
67
  "status": "error",
68
  "error": str(e),
69
+ "message": f"Error starting evaluation: {str(e)}"
70
  }
71
 
72
  @router.get("/evaluation-logs/{session_id}")
 
87
  logs = evaluation_task.get_logs()
88
  is_completed = evaluation_task.is_task_completed()
89
 
90
+ # Get results if available and evaluation is completed
91
  results = None
92
  if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
93
  results = evaluation_task.results
94
 
95
+ # Get step information
96
  progress = evaluation_task.get_progress()
97
 
98
  return {
 
130
  with open(results_file) as f:
131
  results_data = json.load(f)
132
 
133
+ # Check if results are in the new format or old format
134
  if "results" in results_data and isinstance(results_data["results"], list):
135
+ # New format: { "metadata": ..., "results": [...] }
136
  results_list = results_data["results"]
137
  metadata = results_data.get("metadata", {})
138
  else:
139
+ # Old format: [...] (list directly)
140
  results_list = results_data
141
  metadata = {}
142
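Note: the end of the results route above accepts two on-disk layouts, the new {"metadata": ..., "results": [...]} dict and the old bare list. A small sketch of that branch, assuming the payload has already been loaded with json.load; the helper name is made up.

def split_results(results_data):
    """Return (results_list, metadata) for both supported layouts."""
    if isinstance(results_data, dict) and isinstance(results_data.get("results"), list):
        return results_data["results"], results_data.get("metadata", {})
    return results_data, {}

print(split_results({"metadata": {"model": "demo"}, "results": [{"accuracy": 1.0}]}))
print(split_results([{"accuracy": 0.5}]))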
 
backend/routes/upload.py CHANGED
@@ -24,12 +24,12 @@ os.makedirs(UPLOAD_ROOT, exist_ok=True)
24
  # Minimum length for any file (in characters)
25
  MIN_FILE_LENGTH = 500
26
 
27
- # Configuration des limites de sécurité
28
- MAX_CONTENT_SIZE = 5 * 1024 * 1024 # 5 MB max pour le contenu téléchargé
29
- REQUEST_TIMEOUT = 10 # Timeout pour les requêtes HTTP
30
- # Liste des domaines autorisés (vide = tous autorisés, mais à remplir en production)
31
  ALLOWED_DOMAINS: List[str] = []
32
- # Liste d'extensions de fichiers à bloquer dans les URLs
33
  BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
34
 
35
  def validate_pdf(file_path: str) -> bool:
@@ -230,17 +230,17 @@ async def upload_url(url: str = Form(...)):
230
  Dictionary with status and session_id
231
  """
232
  try:
233
- # Valider que l'URL est bien formée
234
  if not validators.url(url):
235
  raise HTTPException(status_code=400, detail="Invalid URL format")
236
 
237
- # Vérifier si l'URL a une extension bloquée
238
  parsed_url = urlparse(url)
239
  path = parsed_url.path.lower()
240
  if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
241
  raise HTTPException(status_code=400, detail="This file type is not allowed")
242
 
243
- # Vérifier si le domaine est autorisé (si la liste n'est pas vide)
244
  domain = parsed_url.netloc
245
  if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
246
  raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
@@ -256,11 +256,11 @@ async def upload_url(url: str = Form(...)):
256
  url,
257
  timeout=REQUEST_TIMEOUT,
258
  headers=headers,
259
- stream=True # Pour vérifier la taille avant de télécharger tout le contenu
260
  )
261
  response.raise_for_status()
262
 
263
- # Vérifier le Content-Type
264
  content_type = response.headers.get('Content-Type', '')
265
  if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
266
  raise HTTPException(
@@ -268,7 +268,7 @@ async def upload_url(url: str = Form(...)):
268
  detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
269
  )
270
 
271
- # Vérifier la taille du contenu
272
  content_length = int(response.headers.get('Content-Length', 0))
273
  if content_length > MAX_CONTENT_SIZE:
274
  raise HTTPException(
 
24
  # Minimum length for any file (in characters)
25
  MIN_FILE_LENGTH = 500
26
 
27
+ # Security limits configuration
28
+ MAX_CONTENT_SIZE = 5 * 1024 * 1024 # 5 MB max for uploaded content
29
+ REQUEST_TIMEOUT = 10 # Timeout for HTTP requests
30
+ # List of allowed domains (empty = all allowed, but should be filled in production)
31
  ALLOWED_DOMAINS: List[str] = []
32
+ # List of file extensions to block in URLs
33
  BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
34
 
35
  def validate_pdf(file_path: str) -> bool:
 
230
  Dictionary with status and session_id
231
  """
232
  try:
233
+ # Validate that the URL is well-formed
234
  if not validators.url(url):
235
  raise HTTPException(status_code=400, detail="Invalid URL format")
236
 
237
+ # Check if URL has a blocked extension
238
  parsed_url = urlparse(url)
239
  path = parsed_url.path.lower()
240
  if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
241
  raise HTTPException(status_code=400, detail="This file type is not allowed")
242
 
243
+ # Check if domain is allowed (if list is not empty)
244
  domain = parsed_url.netloc
245
  if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
246
  raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
 
256
  url,
257
  timeout=REQUEST_TIMEOUT,
258
  headers=headers,
259
+ stream=True # To check size before downloading all content
260
  )
261
  response.raise_for_status()
262
 
263
+ # Check Content-Type
264
  content_type = response.headers.get('Content-Type', '')
265
  if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
266
  raise HTTPException(
 
268
  detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
269
  )
270
 
271
+ # Check content size
272
  content_length = int(response.headers.get('Content-Length', 0))
273
  if content_length > MAX_CONTENT_SIZE:
274
  raise HTTPException(
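Note: the upload route above validates a URL before fetching it. Below is a condensed sketch of just the extension, domain and size checks; it reuses the constants from the diff, but the helper is hypothetical and skips the URL-format, Content-Type and HTTP handling.

from urllib.parse import urlparse

# Constants copied from the diff above.
BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
ALLOWED_DOMAINS = []                # empty list means every domain is allowed
MAX_CONTENT_SIZE = 5 * 1024 * 1024  # 5 MB

def url_passes_checks(url, content_length):
    """Return True if the URL clears the extension, domain and size checks."""
    parsed = urlparse(url)
    if any(parsed.path.lower().endswith(ext) for ext in BLOCKED_EXTENSIONS):
        return False
    if ALLOWED_DOMAINS and parsed.netloc not in ALLOWED_DOMAINS:
        return False
    return content_length <= MAX_CONTENT_SIZE

print(url_passes_checks("https://example.com/report.html", 120_000))  # True
print(url_passes_checks("https://example.com/tool.exe", 1_000))       # False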
backend/tasks/create_bench.py CHANGED
@@ -107,9 +107,9 @@ class CreateBenchTask:
107
  """
108
  self._add_log("[INFO] Starting output capture")
109
 
110
- # Flag pour détecter les erreurs de rate limiting
111
  rate_limit_detected = False
112
- # Flag pour les erreurs JSON non critiques
113
  json_errors_detected = False
114
 
115
  try:
@@ -135,14 +135,14 @@ class CreateBenchTask:
135
  rate_limit_detected = True
136
  self._add_log("[ERROR] RATE_LIMIT_EXCEEDED: The demo is under heavy load at the moment.")
137
 
138
- # Détecter les erreurs JSON non critiques
139
  if ("JSONDecodeError" in line or
140
  "Error processing QA pair" in line or
141
  "'str' object has no attribute 'get'" in line):
142
  json_errors_detected = True
143
- # Ne pas les marquer comme erreurs mais comme avertissements
144
  self._add_log(f"[WARN] Non-critical JSON error: {line}")
145
- continue # Passer à la ligne suivante
146
 
147
  # Log raw output for debugging
148
  self._add_log(f"[DEBUG] Raw output: {line}")
@@ -157,12 +157,12 @@ class CreateBenchTask:
157
  else:
158
  # Detect completed stages
159
  if "Completed stage:" in line:
160
- # Extraire le nom de l'étape
161
  stage = line.split("'")[1] if "'" in line else line.split("Completed stage:")[1].strip()
162
- # Standardiser les noms d'étapes pour correspondre au frontend
163
  stage = self._standardize_stage_name(stage)
164
  self._add_log(f"[SUCCESS] Stage completed: {stage}")
165
- # Vérifier spécifiquement la complétion de l'étape upload_ingest_to_hub
166
  elif "Successfully completed 'upload_ingest_to_hub' stage" in line:
167
  self._add_log(f"[SUCCESS] Stage completed: upload_ingest_to_hub")
168
  else:
@@ -172,22 +172,20 @@ class CreateBenchTask:
172
  if self.process:
173
  exit_code = self.process.poll()
174
  if exit_code == 0 or json_errors_detected:
175
- # Considérer le processus comme réussi même avec des erreurs JSON
176
  if json_errors_detected:
177
  self._add_log("[INFO] Benchmark completed with non-critical JSON errors, considered successful")
178
  else:
179
  self._add_log("[SUCCESS] Benchmark process completed successfully")
180
  else:
181
- # Si une erreur de rate limiting a été détectée, afficher un message spécifique
182
  if rate_limit_detected:
183
  self._add_log("[ERROR] Benchmark process failed due to API rate limiting. The demo is under heavy load at the moment.")
184
- # else:
185
- # self._add_log(f"[ERROR] Benchmark process terminated with error code: {exit_code}")
186
- # Message informatif sur la fin du processus avec erreurs
187
  self._add_log("[INFO] Benchmark process completed with errors")
188
  except Exception as e:
189
  self._add_log(f"[ERROR] Error during output capture: {str(e)}")
190
- # Ne pas ajouter de message de succès en cas d'exception
191
  finally:
192
  self.is_completed = True
193
  self.is_running_flag.clear()
@@ -203,10 +201,10 @@ class CreateBenchTask:
203
  Returns:
204
  Standardized stage name
205
  """
206
- # Table de correspondance pour les noms d'étapes
 
 
207
  stage_mapping = {
208
- # Ajouter ici les correspondances nécessaires
209
- # exemple: "original_name": "standardized_name"
210
  "ingest": "ingestion",
211
  "upload": "upload_ingest_to_hub",
212
  "summarize": "summarization",
@@ -214,12 +212,12 @@ class CreateBenchTask:
214
  "generate_questions": "single_shot_question_generation",
215
  }
216
 
217
- # Chercher des correspondances partielles
218
  for key, value in stage_mapping.items():
219
  if key in stage_name.lower():
220
  return value
221
 
222
- # Si aucune correspondance n'est trouvée, renvoyer le nom d'origine
223
  return stage_name
224
 
225
  def run(self, token: Optional[str] = None) -> None:
 
107
  """
108
  self._add_log("[INFO] Starting output capture")
109
 
110
+ # Flag to detect rate limiting errors
111
  rate_limit_detected = False
112
+ # Flag to detect non-critical JSON errors
113
  json_errors_detected = False
114
 
115
  try:
 
135
  rate_limit_detected = True
136
  self._add_log("[ERROR] RATE_LIMIT_EXCEEDED: The demo is under heavy load at the moment.")
137
 
138
+ # Detect non-critical JSON errors
139
  if ("JSONDecodeError" in line or
140
  "Error processing QA pair" in line or
141
  "'str' object has no attribute 'get'" in line):
142
  json_errors_detected = True
143
+ # Do not mark them as errors but as warnings
144
  self._add_log(f"[WARN] Non-critical JSON error: {line}")
145
+ continue # Skip to next line
146
 
147
  # Log raw output for debugging
148
  self._add_log(f"[DEBUG] Raw output: {line}")
 
157
  else:
158
  # Detect completed stages
159
  if "Completed stage:" in line:
160
+ # Extract step name
161
  stage = line.split("'")[1] if "'" in line else line.split("Completed stage:")[1].strip()
162
+ # Standardize step names to match frontend
163
  stage = self._standardize_stage_name(stage)
164
  self._add_log(f"[SUCCESS] Stage completed: {stage}")
165
+ # Specifically check completion of upload_ingest_to_hub step
166
  elif "Successfully completed 'upload_ingest_to_hub' stage" in line:
167
  self._add_log(f"[SUCCESS] Stage completed: upload_ingest_to_hub")
168
  else:
 
172
  if self.process:
173
  exit_code = self.process.poll()
174
  if exit_code == 0 or json_errors_detected:
175
+ # Consider process successful even with JSON errors
176
  if json_errors_detected:
177
  self._add_log("[INFO] Benchmark completed with non-critical JSON errors, considered successful")
178
  else:
179
  self._add_log("[SUCCESS] Benchmark process completed successfully")
180
  else:
181
+ # If a rate limiting error was detected, display a specific message
182
  if rate_limit_detected:
183
  self._add_log("[ERROR] Benchmark process failed due to API rate limiting. The demo is under heavy load at the moment.")
184
+ # Do not add success message in case of exception
 
 
185
  self._add_log("[INFO] Benchmark process completed with errors")
186
  except Exception as e:
187
  self._add_log(f"[ERROR] Error during output capture: {str(e)}")
188
+ # Do not add success message in case of exception
189
  finally:
190
  self.is_completed = True
191
  self.is_running_flag.clear()
 
201
  Returns:
202
  Standardized stage name
203
  """
204
+ # Mapping table for step names
205
+ # Add necessary mappings here
206
+ # example: "original_name": "standardized_name"
207
  stage_mapping = {
 
 
208
  "ingest": "ingestion",
209
  "upload": "upload_ingest_to_hub",
210
  "summarize": "summarization",
 
212
  "generate_questions": "single_shot_question_generation",
213
  }
214
 
215
+ # Look for partial matches
216
  for key, value in stage_mapping.items():
217
  if key in stage_name.lower():
218
  return value
219
 
220
+ # If no match is found, return original name
221
  return stage_name
222
 
223
  def run(self, token: Optional[str] = None) -> None:
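Note: _standardize_stage_name above normalizes pipeline stage names with partial (substring) matching against a lookup table. The standalone sketch below reproduces only the entries visible in the hunk; the real mapping may contain more rows, and the example inputs are illustrative.

STAGE_MAPPING = {
    "ingest": "ingestion",
    "upload": "upload_ingest_to_hub",
    "summarize": "summarization",
    "generate_questions": "single_shot_question_generation",
}

def standardize_stage_name(stage_name):
    """Map a raw stage name onto the identifier the frontend expects,
    using partial matching; fall back to the original name."""
    for key, value in STAGE_MAPPING.items():
        if key in stage_name.lower():
            return value
    # No partial match: keep the original name.
    return stage_name

print(standardize_stage_name("Summarize documents"))   # summarization
print(standardize_stage_name("config_generation"))     # config_generation (unchanged)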
backend/tasks/create_bench_config_file.py CHANGED
@@ -65,7 +65,7 @@ class CreateBenchConfigTask:
65
  Returns:
66
  List of log messages
67
  """
68
- return self.logs.copy() # Retourner une copie pour éviter les problèmes de référence
69
 
70
  def save_uploaded_file(self, file_path: str) -> str:
71
  """
@@ -99,27 +99,27 @@ class CreateBenchConfigTask:
99
  """
100
  self._add_log(f"[INFO] Finding available provider for {model_name}")
101
 
102
- # Essayer de trouver un provider pour le modèle
103
  provider = get_available_model_provider(model_name, verbose=True)
104
 
105
  if provider:
106
  self._add_log(f"[INFO] Found provider for {model_name}: {provider}")
107
  return provider
108
 
109
- # Si aucun provider n'est trouvé avec la configuration préférée
110
- # Essayons de trouver n'importe quel provider disponible en ignorant la préférence
111
  from huggingface_hub import model_info
112
  from tasks.get_available_model_provider import test_provider
113
 
114
  self._add_log(f"[WARNING] No preferred provider found for {model_name}, trying all available providers...")
115
 
116
  try:
117
- # Obtenir tous les providers possibles pour ce modèle
118
  info = model_info(model_name, expand="inferenceProviderMapping")
119
  if hasattr(info, "inference_provider_mapping"):
120
  providers = list(info.inference_provider_mapping.keys())
121
 
122
- # Exclure les providers préférés déjà testés
123
  other_providers = [p for p in providers if p not in PREFERRED_PROVIDERS]
124
 
125
  if other_providers:
@@ -158,13 +158,13 @@ class CreateBenchConfigTask:
158
  # Get provider for the default model
159
  provider = self.get_model_provider(DEFAULT_BENCHMARK_MODEL)
160
 
161
- # Si aucun provider n'est trouvé pour le modèle par défaut, essayer les modèles alternatifs
162
  selected_model = DEFAULT_BENCHMARK_MODEL
163
 
164
  if not provider:
165
  self._add_log(f"[WARNING] Primary model {DEFAULT_BENCHMARK_MODEL} not available. Trying alternatives...")
166
 
167
- # Utiliser la liste des modèles alternatifs depuis la configuration
168
  for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
169
  self._add_log(f"[INFO] Trying alternative model: {alt_model}")
170
  alt_provider = self.get_model_provider(alt_model)
@@ -174,7 +174,7 @@ class CreateBenchConfigTask:
174
  provider = alt_provider
175
  break
176
 
177
- # Si toujours pas de provider, lever une exception
178
  if not provider:
179
  error_msg = "No model with available provider found. Cannot proceed with benchmark."
180
  self._add_log(f"[ERROR] {error_msg}")
@@ -189,11 +189,11 @@ class CreateBenchConfigTask:
189
  "max_concurrent_requests": 32,
190
  }]
191
 
192
- # Mettre à jour les roles de modèle si un modèle alternatif est utilisé
193
  model_roles = dict(BENCHMARK_MODEL_ROLES)
194
  if selected_model != DEFAULT_BENCHMARK_MODEL:
195
  for role in model_roles:
196
- if role != "chunking": # Ne pas changer le modèle de chunking
197
  model_roles[role] = [selected_model]
198
 
199
  self._add_log(f"[INFO] Updated model roles to use {selected_model}")
@@ -351,7 +351,7 @@ class CreateBenchConfigTask:
351
  # time.sleep(2) # Simulate delay
352
  self._add_log("[SUCCESS] Stage completed: config_generation")
353
 
354
- # Tâche terminée
355
  self.mark_task_completed()
356
 
357
  return str(config_path)
 
65
  Returns:
66
  List of log messages
67
  """
68
+ return self.logs.copy() # Return a copy to avoid reference problems
69
 
70
  def save_uploaded_file(self, file_path: str) -> str:
71
  """
 
99
  """
100
  self._add_log(f"[INFO] Finding available provider for {model_name}")
101
 
102
+ # Try to find a provider for the model
103
  provider = get_available_model_provider(model_name, verbose=True)
104
 
105
  if provider:
106
  self._add_log(f"[INFO] Found provider for {model_name}: {provider}")
107
  return provider
108
 
109
+ # If no provider is found with the preferred configuration
110
+ # Let's try to find any available provider by ignoring the preference
111
  from huggingface_hub import model_info
112
  from tasks.get_available_model_provider import test_provider
113
 
114
  self._add_log(f"[WARNING] No preferred provider found for {model_name}, trying all available providers...")
115
 
116
  try:
117
+ # Get all possible providers for this model
118
  info = model_info(model_name, expand="inferenceProviderMapping")
119
  if hasattr(info, "inference_provider_mapping"):
120
  providers = list(info.inference_provider_mapping.keys())
121
 
122
+ # Exclude preferred providers already tested
123
  other_providers = [p for p in providers if p not in PREFERRED_PROVIDERS]
124
 
125
  if other_providers:
 
158
  # Get provider for the default model
159
  provider = self.get_model_provider(DEFAULT_BENCHMARK_MODEL)
160
 
161
+ # If no provider is found for the default model, try alternative models
162
  selected_model = DEFAULT_BENCHMARK_MODEL
163
 
164
  if not provider:
165
  self._add_log(f"[WARNING] Primary model {DEFAULT_BENCHMARK_MODEL} not available. Trying alternatives...")
166
 
167
+ # Use the list of alternative models from configuration
168
  for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
169
  self._add_log(f"[INFO] Trying alternative model: {alt_model}")
170
  alt_provider = self.get_model_provider(alt_model)
 
174
  provider = alt_provider
175
  break
176
 
177
+ # If still no provider is found, raise an exception
178
  if not provider:
179
  error_msg = "No model with available provider found. Cannot proceed with benchmark."
180
  self._add_log(f"[ERROR] {error_msg}")
 
189
  "max_concurrent_requests": 32,
190
  }]
191
 
192
+ # Update model roles if an alternative model is used
193
  model_roles = dict(BENCHMARK_MODEL_ROLES)
194
  if selected_model != DEFAULT_BENCHMARK_MODEL:
195
  for role in model_roles:
196
+ if role != "chunking": # Do not change the chunking model
197
  model_roles[role] = [selected_model]
198
 
199
  self._add_log(f"[INFO] Updated model roles to use {selected_model}")
 
351
  # time.sleep(2) # Simulate delay
352
  self._add_log("[SUCCESS] Stage completed: config_generation")
353
 
354
+ # Task completed
355
  self.mark_task_completed()
356
 
357
  return str(config_path)
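Note: the config task above falls back to ALTERNATIVE_BENCHMARK_MODELS when the default model has no reachable provider, then swaps the selected model into every role except chunking. The sketch below isolates just the fallback selection; the function and the stubbed lookup are assumptions, and the model/provider pairing in the example is invented for illustration.

def pick_model_with_provider(default_model, alternative_models, get_provider):
    """Return (model, provider): try the default model first, then each
    alternative in order; raise if nothing has a working provider."""
    provider = get_provider(default_model)
    if provider:
        return default_model, provider
    for alt_model in alternative_models:
        alt_provider = get_provider(alt_model)
        if alt_provider:
            return alt_model, alt_provider
    raise RuntimeError("No model with available provider found. Cannot proceed with benchmark.")

# Toy usage with a stubbed lookup standing in for get_model_provider.
stub_providers = {"meta-llama/Llama-3.1-8B-Instruct": "novita"}
print(pick_model_with_provider(
    "Qwen/QwQ-32B",
    ["meta-llama/Llama-3.1-8B-Instruct"],
    stub_providers.get,
))  # ('meta-llama/Llama-3.1-8B-Instruct', 'novita')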
backend/tasks/get_available_model_provider.py CHANGED
@@ -38,7 +38,7 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
38
  if verbose:
39
  logger.warning("No HF_TOKEN found in environment variables. This will likely cause authentication failures.")
40
  print("WARNING: HF_TOKEN is missing. Most model providers require valid authentication.")
41
- # Essayer sans token (pour certains providers qui acceptent des requêtes anonymes)
42
  return _test_provider_without_token(model_name, provider, verbose)
43
 
44
  # Get HF organization from environment
@@ -82,7 +82,7 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
82
  elif "status_code=401" in error_message or "status_code=403" in error_message:
83
  logger.warning(f"Authentication failed for provider {provider}. Your HF_TOKEN may be invalid or expired.")
84
  print(f"Authentication error with provider {provider}. Please check your HF_TOKEN.")
85
- # Essayer sans token
86
  if verbose:
87
  logger.info(f"Trying provider {provider} without authentication")
88
  return _test_provider_without_token(model_name, provider, verbose)
@@ -93,7 +93,7 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
93
  return False
94
  except Exception as auth_error:
95
  if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
96
- # En cas d'erreur d'authentification, essayer sans token
97
  if verbose:
98
  logger.warning(f"Authentication error with {provider}: {str(auth_error)}. Your HF_TOKEN may be invalid.")
99
  print(f"Authentication error detected. Please verify your HF_TOKEN is valid and has appropriate permissions.")
@@ -110,15 +110,15 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
110
 
111
  def _test_provider_without_token(model_name: str, provider: str, verbose: bool = False) -> bool:
112
  """
113
- Essaye de tester un provider sans token d'authentification
114
 
115
  Args:
116
- model_name: Nom du modèle
117
- provider: Provider à tester
118
- verbose: Afficher les logs détaillés
119
 
120
  Returns:
121
- True si le provider est disponible, False sinon
122
  """
123
  try:
124
  if verbose:
@@ -175,48 +175,48 @@ def get_available_model_provider(model_name, verbose=False):
175
  # Get providers for the model and prioritize them
176
  info = None
177
  try:
178
- # Essayer avec le token
179
  try:
180
  if verbose:
181
  logger.info(f"Trying to get model info for {model_name} with auth token")
182
  info = model_info(model_name, token=hf_token, expand="inferenceProviderMapping")
183
  except Exception as auth_error:
184
- # Si l'authentification échoue, essayer sans token (pour les modèles publics)
185
  if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
186
  if verbose:
187
  logger.warning(f"Authentication failed for {model_name}, trying without token")
188
- # Essayer de récupérer les infos sans token
189
  try:
190
  info = model_info(model_name, expand="inferenceProviderMapping")
191
  except Exception as e:
192
  if verbose:
193
  logger.error(f"Failed to get model info without token: {str(e)}")
194
- # Comme dernier recours, retourner la liste des providers par défaut pour tester
195
  if verbose:
196
  logger.warning(f"Using default providers list as fallback for {model_name}")
197
- # Fournir une liste de providers de secours pour tester directement
198
  return _test_fallback_providers(model_name, verbose)
199
  else:
200
- # Autre erreur, la relancer
201
  raise auth_error
202
 
203
  if not info or not hasattr(info, "inference_provider_mapping"):
204
  if verbose:
205
  logger.info(f"No inference providers found for {model_name}")
206
- # Essayer avec la liste de providers par défaut
207
  return _test_fallback_providers(model_name, verbose)
208
 
209
  providers = list(info.inference_provider_mapping.keys())
210
  if not providers:
211
  if verbose:
212
  logger.info(f"Empty list of providers for {model_name}")
213
- # Essayer avec la liste de providers par défaut
214
  return _test_fallback_providers(model_name, verbose)
215
 
216
  except Exception as e:
217
  if verbose:
218
  logger.error(f"Error retrieving model info for {model_name}: {str(e)}")
219
- # Essayer avec la liste de providers par défaut
220
  return _test_fallback_providers(model_name, verbose)
221
 
222
  # Prioritize providers
@@ -277,22 +277,22 @@ def get_available_model_provider(model_name, verbose=False):
277
 
278
  def _test_fallback_providers(model_name, verbose=False):
279
  """
280
- Fonction de secours qui teste une liste de providers communs sans passer par l'API
281
 
282
  Args:
283
- model_name: Nom du modèle
284
- verbose: Afficher les logs détaillés
285
 
286
  Returns:
287
- Le premier provider disponible ou None
288
  """
289
- # Liste de providers à tester en direct
290
  default_providers = ["huggingface", "sambanova", "novita", "fireworks-ai", "together", "openai", "anthropic"]
291
 
292
  if verbose:
293
  logger.warning(f"Using fallback providers list for {model_name}: {', '.join(default_providers)}")
294
 
295
- # Tester chaque provider directement
296
  for provider in default_providers:
297
  if verbose:
298
  logger.info(f"Testing fallback provider {provider} for {model_name}")
@@ -309,13 +309,13 @@ def _test_fallback_providers(model_name, verbose=False):
309
 
310
  def test_models(verbose=True):
311
  """
312
- Test le modèle par défaut et les modèles alternatifs, puis retourne un résumé des résultats.
313
 
314
  Args:
315
- verbose: Afficher les logs détaillés
316
 
317
  Returns:
318
- Un dictionnaire avec les résultats des tests
319
  """
320
  results = {
321
  "default_model": None,
@@ -327,22 +327,22 @@ def test_models(verbose=True):
327
  }
328
 
329
  print("\n===== Checking HuggingFace Authentication =====")
330
- # Obtenez le jeton HF
331
  hf_token = os.environ.get("HF_TOKEN")
332
  if hf_token:
333
  print("✅ HF_TOKEN is available")
334
 
335
- # Vérifier si le token a un format valide (vérification simple)
336
  if not hf_token.startswith("hf_"):
337
  print("⚠️ WARNING: Your HF_TOKEN does not start with 'hf_' which is unusual. Please verify its format.")
338
 
339
- # Ne montrer aucun caractère du token, juste indiquer sa présence
340
  masked_token = "••••••••••"
341
 
342
- # Vérifier la validité du token en testant directement l'API d'inférence
343
  import requests
344
  try:
345
- # Test avec un modèle public simple (gpt2)
346
  test_model = "gpt2"
347
  api_url = f"https://api-inference.huggingface.co/models/{test_model}"
348
 
@@ -353,13 +353,13 @@ def test_models(verbose=True):
353
 
354
  response = requests.post(api_url, headers=headers, json=payload, timeout=10)
355
 
356
- if response.status_code in [200, 503]: # 503 = modèle en cours de chargement, mais le token est accepté
357
  print(f"✅ HF_TOKEN validated - Token accepted by the inference API! Status: {response.status_code}")
358
  if response.status_code == 503:
359
  print("ℹ️ Model is loading, but token is valid")
360
 
361
- # Si le token est valide pour l'API d'inférence, vérifions également si nous pouvons obtenir
362
- # des informations sur l'utilisateur (mais ce n'est pas bloquant si ça échoue)
363
  try:
364
  whoami_response = requests.get(
365
  "https://huggingface.co/api/whoami",
@@ -370,13 +370,13 @@ def test_models(verbose=True):
370
  user_info = whoami_response.json()
371
  print(f"✅ Additional info - Authenticated as: {user_info.get('name', 'Unknown user')}")
372
 
373
- # Vérifier si l'utilisateur a accès à des modèles payants
374
  if user_info.get('canPay', False):
375
  print("✅ Your account has payment methods configured - you may have access to premium models")
376
  else:
377
  print("ℹ️ Your account does not have payment methods configured - access to premium models may be limited")
378
  except Exception:
379
- # Ignorer les erreurs lors de la récupération des infos utilisateur
380
  pass
381
  else:
382
  print(f"❌ HF_TOKEN validation failed with status code: {response.status_code}")
@@ -391,7 +391,7 @@ def test_models(verbose=True):
391
 
392
  print("⚠️ Most model providers will not work with invalid credentials")
393
 
394
- # Test alternatif avec l'endpoint status
395
  try:
396
  print("Attempting alternative validation with status endpoint...")
397
  status_url = "https://api-inference.huggingface.co/status"
@@ -409,7 +409,7 @@ def test_models(verbose=True):
409
  print("❌ HF_TOKEN is missing - authentication to HuggingFace API will fail")
410
  print("⚠️ Most models and providers require authentication")
411
 
412
- # Obtenez l'organisation HF
413
  hf_organization = os.environ.get("HF_ORGANIZATION")
414
  if hf_organization:
415
  print(f"✅ HF_ORGANIZATION is available: {hf_organization}")
@@ -419,7 +419,7 @@ def test_models(verbose=True):
419
  if verbose:
420
  print(f"\n===== Testing main default model: {DEFAULT_BENCHMARK_MODEL} =====")
421
 
422
- # Test du modèle par défaut
423
  provider = get_available_model_provider(DEFAULT_BENCHMARK_MODEL, verbose=verbose)
424
 
425
  if provider:
@@ -433,7 +433,7 @@ def test_models(verbose=True):
433
  print(f"\n❌ DEFAULT MODEL FAILED: No provider found for {DEFAULT_BENCHMARK_MODEL}")
434
  print("Trying alternative models...")
435
 
436
- # Essayer les modèles alternatifs
437
  for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
438
  if verbose:
439
  print(f"\nTrying alternative model: {alt_model}")
@@ -452,7 +452,7 @@ def test_models(verbose=True):
452
  print("\n⚠️ This is likely due to authentication issues with your HF_TOKEN")
453
  print("⚠️ Please check your token or try using models that don't require authentication")
454
 
455
- # Tester tous les modèles pour avoir une vue d'ensemble
456
  models = [
457
  "Qwen/QwQ-32B",
458
  "Qwen/Qwen2.5-72B-Instruct",
@@ -495,5 +495,5 @@ def test_models(verbose=True):
495
  return results
496
 
497
  if __name__ == "__main__":
498
- # Exécuter le test si le script est lancé directement
499
  test_results = test_models(verbose=True)
 
38
  if verbose:
39
  logger.warning("No HF_TOKEN found in environment variables. This will likely cause authentication failures.")
40
  print("WARNING: HF_TOKEN is missing. Most model providers require valid authentication.")
41
+ # Try without token (for providers that accept anonymous requests)
42
  return _test_provider_without_token(model_name, provider, verbose)
43
 
44
  # Get HF organization from environment
 
82
  elif "status_code=401" in error_message or "status_code=403" in error_message:
83
  logger.warning(f"Authentication failed for provider {provider}. Your HF_TOKEN may be invalid or expired.")
84
  print(f"Authentication error with provider {provider}. Please check your HF_TOKEN.")
85
+ # If authentication fails, try without token (for public models)
86
  if verbose:
87
  logger.info(f"Trying provider {provider} without authentication")
88
  return _test_provider_without_token(model_name, provider, verbose)
 
93
  return False
94
  except Exception as auth_error:
95
  if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
96
+ # If authentication fails, try without token (for public models)
97
  if verbose:
98
  logger.warning(f"Authentication error with {provider}: {str(auth_error)}. Your HF_TOKEN may be invalid.")
99
  print(f"Authentication error detected. Please verify your HF_TOKEN is valid and has appropriate permissions.")
 
110
 
111
  def _test_provider_without_token(model_name: str, provider: str, verbose: bool = False) -> bool:
112
  """
113
+ Test a provider without an authentication token
114
 
115
  Args:
116
+ model_name: Name of the model
117
+ provider: Provider to test
118
+ verbose: Display detailed logs
119
 
120
  Returns:
121
+ True if provider is available, False otherwise
122
  """
123
  try:
124
  if verbose:
 
175
  # Get providers for the model and prioritize them
176
  info = None
177
  try:
178
+ # Try with token
179
  try:
180
  if verbose:
181
  logger.info(f"Trying to get model info for {model_name} with auth token")
182
  info = model_info(model_name, token=hf_token, expand="inferenceProviderMapping")
183
  except Exception as auth_error:
184
+ # If authentication fails, try without token (for public models)
185
  if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
186
  if verbose:
187
  logger.warning(f"Authentication failed for {model_name}, trying without token")
188
+ # Try to get info without token
189
  try:
190
  info = model_info(model_name, expand="inferenceProviderMapping")
191
  except Exception as e:
192
  if verbose:
193
  logger.error(f"Failed to get model info without token: {str(e)}")
194
+ # As a last resort, fall back to testing the default list of providers
195
  if verbose:
196
  logger.warning(f"Using default providers list as fallback for {model_name}")
197
+ # Try with default provider list
198
  return _test_fallback_providers(model_name, verbose)
199
  else:
200
+ # Other error, re-raise
201
  raise auth_error
202
 
203
  if not info or not hasattr(info, "inference_provider_mapping"):
204
  if verbose:
205
  logger.info(f"No inference providers found for {model_name}")
206
+ # Try with default provider list
207
  return _test_fallback_providers(model_name, verbose)
208
 
209
  providers = list(info.inference_provider_mapping.keys())
210
  if not providers:
211
  if verbose:
212
  logger.info(f"Empty list of providers for {model_name}")
213
+ # Try with default provider list
214
  return _test_fallback_providers(model_name, verbose)
215
 
216
  except Exception as e:
217
  if verbose:
218
  logger.error(f"Error retrieving model info for {model_name}: {str(e)}")
219
+ # Try with default provider list
220
  return _test_fallback_providers(model_name, verbose)
221
 
222
  # Prioritize providers
 
277
 
278
  def _test_fallback_providers(model_name, verbose=False):
279
  """
280
+ Fallback function that tests a list of common providers without going through the API
281
 
282
  Args:
283
+ model_name: Name of the model
284
+ verbose: Display detailed logs
285
 
286
  Returns:
287
+ The first available provider or None
288
  """
289
+ # List of providers to test directly
290
  default_providers = ["huggingface", "sambanova", "novita", "fireworks-ai", "together", "openai", "anthropic"]
291
 
292
  if verbose:
293
  logger.warning(f"Using fallback providers list for {model_name}: {', '.join(default_providers)}")
294
 
295
+ # Test each provider directly
296
  for provider in default_providers:
297
  if verbose:
298
  logger.info(f"Testing fallback provider {provider} for {model_name}")
 
309
 
310
  def test_models(verbose=True):
311
  """
312
+ Test the default model and alternative models, then return a summary of results.
313
 
314
  Args:
315
+ verbose: Display detailed logs
316
 
317
  Returns:
318
+ A dictionary with test results
319
  """
320
  results = {
321
  "default_model": None,
 
327
  }
328
 
329
  print("\n===== Checking HuggingFace Authentication =====")
330
+ # Get HF token
331
  hf_token = os.environ.get("HF_TOKEN")
332
  if hf_token:
333
  print("✅ HF_TOKEN is available")
334
 
335
+ # Check if token has a valid format (simple check)
336
  if not hf_token.startswith("hf_"):
337
  print("⚠️ WARNING: Your HF_TOKEN does not start with 'hf_' which is unusual. Please verify its format.")
338
 
339
+ # Don't show any token characters, just indicate its presence
340
  masked_token = "••••••••••"
341
 
342
+ # Check token validity by testing inference API directly
343
  import requests
344
  try:
345
+ # Test with a simple public model (gpt2)
346
  test_model = "gpt2"
347
  api_url = f"https://api-inference.huggingface.co/models/{test_model}"
348
 
 
353
 
354
  response = requests.post(api_url, headers=headers, json=payload, timeout=10)
355
 
356
+ if response.status_code in [200, 503]: # 503 = model is loading, but token is accepted
357
  print(f"✅ HF_TOKEN validated - Token accepted by the inference API! Status: {response.status_code}")
358
  if response.status_code == 503:
359
  print("ℹ️ Model is loading, but token is valid")
360
 
361
+ # If token is valid for inference API, also check if we can get
362
+ # user information (but not blocking if it fails)
363
  try:
364
  whoami_response = requests.get(
365
  "https://huggingface.co/api/whoami",
 
370
  user_info = whoami_response.json()
371
  print(f"✅ Additional info - Authenticated as: {user_info.get('name', 'Unknown user')}")
372
 
373
+ # Check if user has access to paid models
374
  if user_info.get('canPay', False):
375
  print("✅ Your account has payment methods configured - you may have access to premium models")
376
  else:
377
  print("ℹ️ Your account does not have payment methods configured - access to premium models may be limited")
378
  except Exception:
379
+ # Ignore errors when getting user info
380
  pass
381
  else:
382
  print(f"❌ HF_TOKEN validation failed with status code: {response.status_code}")
 
391
 
392
  print("⚠️ Most model providers will not work with invalid credentials")
393
 
394
+ # Alternative test with status endpoint
395
  try:
396
  print("Attempting alternative validation with status endpoint...")
397
  status_url = "https://api-inference.huggingface.co/status"
 
409
  print("❌ HF_TOKEN is missing - authentication to HuggingFace API will fail")
410
  print("⚠️ Most models and providers require authentication")
411
 
412
+ # Get HF organization
413
  hf_organization = os.environ.get("HF_ORGANIZATION")
414
  if hf_organization:
415
  print(f"✅ HF_ORGANIZATION is available: {hf_organization}")
 
419
  if verbose:
420
  print(f"\n===== Testing main default model: {DEFAULT_BENCHMARK_MODEL} =====")
421
 
422
+ # Test the default model
423
  provider = get_available_model_provider(DEFAULT_BENCHMARK_MODEL, verbose=verbose)
424
 
425
  if provider:
 
433
  print(f"\n❌ DEFAULT MODEL FAILED: No provider found for {DEFAULT_BENCHMARK_MODEL}")
434
  print("Trying alternative models...")
435
 
436
+ # Try alternative models
437
  for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
438
  if verbose:
439
  print(f"\nTrying alternative model: {alt_model}")
 
452
  print("\n⚠️ This is likely due to authentication issues with your HF_TOKEN")
453
  print("⚠️ Please check your token or try using models that don't require authentication")
454
 
455
+ # Test all models to get an overview
456
  models = [
457
  "Qwen/QwQ-32B",
458
  "Qwen/Qwen2.5-72B-Instruct",
 
495
  return results
496
 
497
  if __name__ == "__main__":
498
+ # Run test if script is run directly
499
  test_results = test_models(verbose=True)
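For reference, the provider-resolution helpers above can be exercised on their own. A minimal sketch, assuming the module is importable (the import path below is hypothetical, since the diff does not show this file's name) and that HF_TOKEN is exported:

import os

# Hypothetical import path; point this at wherever get_available_model_provider lives.
from model_provider_checks import get_available_model_provider, test_models

assert os.environ.get("HF_TOKEN"), "most providers require a valid HF_TOKEN"

# Resolve a working inference provider for one model; when Hub metadata cannot
# be retrieved, the helper falls back to testing the default provider list directly.
provider = get_available_model_provider("Qwen/Qwen2.5-72B-Instruct", verbose=True)
print("selected provider:", provider)  # None when no provider responds

# Or run the full sweep over the default and alternative benchmark models.
summary = test_models(verbose=False)
print("default model:", summary["default_model"])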
backend/tests/check_hf_token.py CHANGED
@@ -2,8 +2,8 @@
2
  # -*- coding: utf-8 -*-
3
 
4
  """
5
- Script standalone pour vérifier et afficher les propriétés d'un token Hugging Face.
6
- Ce script peut être exécuté séparément pour diagnostiquer les problèmes d'authentification.
7
  """
8
 
9
  import os
@@ -42,16 +42,16 @@ def info(text):
42
 
43
  def check_token_via_inference_api(token=None, verbose=True):
44
  """
45
- Vérifie la validité d'un token HF en testant directement l'API d'inférence.
46
- L'API whoami ne fonctionne pas toujours correctement pour les tokens mais l'API d'inférence
47
- est la priorité dans notre application.
48
 
49
  Args:
50
- token: Le token à vérifier
51
- verbose: Afficher des informations détaillées
52
 
53
  Returns:
54
- dict: Résultats de la vérification
55
  """
56
  results = {
57
  "is_valid": False,
@@ -69,21 +69,21 @@ def check_token_via_inference_api(token=None, verbose=True):
69
  results["error_message"] = "No token provided"
70
  return results
71
 
72
- # Ne montrer aucun caractère du token, juste indiquer sa présence
73
  masked_token = "••••••••••"
74
  results["token"] = masked_token
75
 
76
  print(info(f"Token à vérifier: {masked_token}"))
77
 
78
- # 2. Vérifier le format basique
79
  if not token.startswith("hf_"):
80
  print(warning("Le token ne commence pas par 'hf_' ce qui est inhabituel. Vérifiez son format."))
81
  else:
82
  print(success("Format du token valide (commence par 'hf_')"))
83
 
84
- # 3. Tester l'API d'inférence directement - méthode recommandée pour valider un token
85
  try:
86
- # Test avec un modèle public simple
87
  test_model = "gpt2"
88
  api_url = f"https://api-inference.huggingface.co/models/{test_model}"
89
 
@@ -94,7 +94,7 @@ def check_token_via_inference_api(token=None, verbose=True):
94
 
95
  response = requests.post(api_url, headers=headers, json=payload, timeout=10)
96
 
97
- if response.status_code in [200, 503]: # 503 signifie que le modèle est en cours de chargement, mais le token est valide
98
  print(success(f"Token valide pour l'API d'inférence! Status code: {response.status_code}"))
99
  if response.status_code == 503:
100
  print(info("Le modèle est en cours de chargement. Le token a bien été accepté par l'API."))
@@ -116,7 +116,7 @@ def check_token_via_inference_api(token=None, verbose=True):
116
  except:
117
  print(error(f"Message d'erreur: {response.text}"))
118
 
119
- # En cas d'échec, tester aussi l'endpoint de liste des modèles
120
  try:
121
  print(info("Test alternatif avec la liste des modèles déployés..."))
122
  list_url = "https://api-inference.huggingface.co/status"
@@ -135,17 +135,17 @@ def check_token_via_inference_api(token=None, verbose=True):
135
  print(error(f"Erreur lors du test de l'API d'inférence: {str(e)}"))
136
  results["error_message"] = str(e)
137
 
138
- # 4. Tests supplémentaires des permissions
139
  if results["is_valid"]:
140
  try:
141
  print(info("\nTest des permissions du token..."))
142
 
143
- # Tester si on peut accéder aux modèles privés de l'organisation
144
  if os.environ.get("HF_ORGANIZATION"):
145
  org = os.environ.get("HF_ORGANIZATION")
146
  print(info(f"Test d'accès aux modèles de l'organisation {org}..."))
147
 
148
- # On regarde juste si on peut accéder à la liste des modèles de l'organisation
149
  org_url = f"https://huggingface.co/api/models?author={org}"
150
  org_response = requests.get(org_url, headers=headers, timeout=10)
151
 
@@ -161,15 +161,15 @@ def check_token_via_inference_api(token=None, verbose=True):
161
 
162
  def check_model_access(token, model, verbose=False):
163
  """
164
- Vérifie si le token a accès à un modèle spécifique.
165
 
166
  Args:
167
- token: Token HF à vérifier
168
- model: Nom du modèle à tester
169
- verbose: Afficher des informations détaillées
170
 
171
  Returns:
172
- bool: True si le modèle est accessible, False sinon
173
  """
174
  print(f"\n" + info(f"Test d'accès au modèle: {model}"))
175
 
@@ -177,7 +177,7 @@ def check_model_access(token, model, verbose=False):
177
  "Authorization": f"Bearer {token}"
178
  }
179
 
180
- # 1. Vérifier si le modèle existe et est accessible via l'API d'inférence
181
  try:
182
  api_url = f"https://api-inference.huggingface.co/models/{model}"
183
  payload = {"inputs": "Hello, test access"}
@@ -186,7 +186,7 @@ def check_model_access(token, model, verbose=False):
186
 
187
  response = requests.post(api_url, headers=headers, json=payload, timeout=20)
188
 
189
- if response.status_code in [200, 503]: # 503 = modèle en cours de chargement, mais le token est valide
190
  if response.status_code == 200:
191
  print(success(f"Accès réussi à l'API d'inférence pour {model}"))
192
  return True
@@ -210,7 +210,7 @@ def check_model_access(token, model, verbose=False):
210
  print(warning("Possible problème de quota ou de limite de taux"))
211
  elif "loading" in error_message.lower():
212
  print(info("Le modèle est en cours de chargement - réessayez plus tard"))
213
- return True # Considérer comme un succès car le token est accepté
214
  elif "permission" in error_message.lower() or "access" in error_message.lower():
215
  print(error("Problème de permissions - vous n'avez pas accès à ce modèle"))
216
 
@@ -250,16 +250,16 @@ def main():
250
 
251
  args = parser.parse_args()
252
 
253
- # Charger les variables d'environnement
254
  load_dotenv()
255
 
256
  print(info(f"=== Vérification de Token Hugging Face - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n"))
257
 
258
- # Vérifier le token via l'API d'inférence directement
259
  token = args.token or os.environ.get("HF_TOKEN")
260
  token_info = check_token_via_inference_api(token, args.verbose)
261
 
262
- # Si le token est valide et qu'on a demandé de tester un modèle
263
  if token_info["is_valid"]:
264
  if args.test_model:
265
  check_model_access(token, args.test_model, args.verbose)
 
2
  # -*- coding: utf-8 -*-
3
 
4
  """
5
+ Standalone script to check and display Hugging Face token properties.
6
+ This script can be run separately to diagnose authentication issues.
7
  """
8
 
9
  import os
 
42
 
43
  def check_token_via_inference_api(token=None, verbose=True):
44
  """
45
+ Check the validity of an HF token by directly testing the inference API.
46
+ The whoami API doesn't always work correctly for tokens, but the inference API
47
+ is the priority in our application.
48
 
49
  Args:
50
+ token: The token to check
51
+ verbose: Display detailed information
52
 
53
  Returns:
54
+ dict: Check results
55
  """
56
  results = {
57
  "is_valid": False,
 
69
  results["error_message"] = "No token provided"
70
  return results
71
 
72
+ # Don't show any token characters, just indicate its presence
73
  masked_token = "••••••••••"
74
  results["token"] = masked_token
75
 
76
  print(info(f"Token à vérifier: {masked_token}"))
77
 
78
+ # 2. Check basic format
79
  if not token.startswith("hf_"):
80
  print(warning("Le token ne commence pas par 'hf_' ce qui est inhabituel. Vérifiez son format."))
81
  else:
82
  print(success("Format du token valide (commence par 'hf_')"))
83
 
84
+ # 3. Test inference API directly - recommended method to validate a token
85
  try:
86
+ # Test with a simple public model
87
  test_model = "gpt2"
88
  api_url = f"https://api-inference.huggingface.co/models/{test_model}"
89
 
 
94
 
95
  response = requests.post(api_url, headers=headers, json=payload, timeout=10)
96
 
97
+ if response.status_code in [200, 503]: # 503 means the model is loading, but the token is valid
98
  print(success(f"Token valide pour l'API d'inférence! Status code: {response.status_code}"))
99
  if response.status_code == 503:
100
  print(info("Le modèle est en cours de chargement. Le token a bien été accepté par l'API."))
 
116
  except:
117
  print(error(f"Message d'erreur: {response.text}"))
118
 
119
+ # In case of failure, also test the model list endpoint
120
  try:
121
  print(info("Test alternatif avec la liste des modèles déployés..."))
122
  list_url = "https://api-inference.huggingface.co/status"
 
135
  print(error(f"Erreur lors du test de l'API d'inférence: {str(e)}"))
136
  results["error_message"] = str(e)
137
 
138
+ # 4. Additional permission tests
139
  if results["is_valid"]:
140
  try:
141
  print(info("\nTest des permissions du token..."))
142
 
143
+ # Test if we can access organization's private models
144
  if os.environ.get("HF_ORGANIZATION"):
145
  org = os.environ.get("HF_ORGANIZATION")
146
  print(info(f"Test d'accès aux modèles de l'organisation {org}..."))
147
 
148
+ # Just check if we can access the organization's model list
149
  org_url = f"https://huggingface.co/api/models?author={org}"
150
  org_response = requests.get(org_url, headers=headers, timeout=10)
151
 
 
161
 
162
  def check_model_access(token, model, verbose=False):
163
  """
164
+ Check if the token has access to a specific model.
165
 
166
  Args:
167
+ token: HF token to check
168
+ model: Name of the model to test
169
+ verbose: Display detailed information
170
 
171
  Returns:
172
+ bool: True if model is accessible, False otherwise
173
  """
174
  print(f"\n" + info(f"Test d'accès au modèle: {model}"))
175
 
 
177
  "Authorization": f"Bearer {token}"
178
  }
179
 
180
+ # 1. Check if the model exists and is accessible via inference API
181
  try:
182
  api_url = f"https://api-inference.huggingface.co/models/{model}"
183
  payload = {"inputs": "Hello, test access"}
 
186
 
187
  response = requests.post(api_url, headers=headers, json=payload, timeout=20)
188
 
189
+ if response.status_code in [200, 503]: # 503 = model is loading, but token is valid
190
  if response.status_code == 200:
191
  print(success(f"Accès réussi à l'API d'inférence pour {model}"))
192
  return True
 
210
  print(warning("Possible problème de quota ou de limite de taux"))
211
  elif "loading" in error_message.lower():
212
  print(info("Le modèle est en cours de chargement - réessayez plus tard"))
213
+ return True # Treat as a success because the token is accepted
214
  elif "permission" in error_message.lower() or "access" in error_message.lower():
215
  print(error("Problème de permissions - vous n'avez pas accès à ce modèle"))
216
 
 
250
 
251
  args = parser.parse_args()
252
 
253
+ # Load environment variables
254
  load_dotenv()
255
 
256
  print(info(f"=== Vérification de Token Hugging Face - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n"))
257
 
258
+ # Check token directly via inference API
259
  token = args.token or os.environ.get("HF_TOKEN")
260
  token_info = check_token_via_inference_api(token, args.verbose)
261
 
262
+ # If token is valid and we were asked to test a model
263
  if token_info["is_valid"]:
264
  if args.test_model:
265
  check_model_access(token, args.test_model, args.verbose)
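The token check in check_hf_token.py boils down to a single request against the inference API, with status 200 or 503 treated as "token accepted". A minimal sketch of that request on its own, assuming HF_TOKEN is set (the gpt2 test model and the status-code handling mirror the script above):

import os
import requests

token = os.environ.get("HF_TOKEN")
headers = {"Authorization": f"Bearer {token}"}

# Same probe the script uses: POST a tiny payload to a public model.
response = requests.post(
    "https://api-inference.huggingface.co/models/gpt2",
    headers=headers,
    json={"inputs": "Hello"},
    timeout=10,
)

# 503 means the model is still loading, but the token was accepted.
if response.status_code in (200, 503):
    print("token accepted by the inference API")
else:
    print(f"token rejected, status code {response.status_code}")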
frontend/src/components/Evaluation/Display.jsx CHANGED
@@ -20,7 +20,7 @@ import OpenInNewIcon from "@mui/icons-material/OpenInNew";
20
  import CheckCircleIcon from "@mui/icons-material/CheckCircle";
21
  import ErrorDisplay from "../common/ErrorDisplay";
22
 
23
- // Styles pour les médailles
24
  const MEDAL_STYLES = {
25
  1: {
26
  color: "#B58A1B",
@@ -48,7 +48,7 @@ const MEDAL_STYLES = {
48
  },
49
  };
50
 
51
- // Fonction pour obtenir le style de médaille en fonction du rang
52
  const getMedalStyle = (rank) => {
53
  if (rank <= 3) {
54
  const medalStyle = MEDAL_STYLES[rank];
@@ -72,7 +72,7 @@ const getMedalStyle = (rank) => {
72
  marginRight: "8px",
73
  };
74
  }
75
- // Pour les rangs > 3, même dimensions mais transparent
76
  return {
77
  color: "text.primary",
78
  fontWeight: rank <= 10 ? 600 : 400,
 
20
  import CheckCircleIcon from "@mui/icons-material/CheckCircle";
21
  import ErrorDisplay from "../common/ErrorDisplay";
22
 
23
+ // Styles for medals
24
  const MEDAL_STYLES = {
25
  1: {
26
  color: "#B58A1B",
 
48
  },
49
  };
50
 
51
+ // Function to get medal style based on rank
52
  const getMedalStyle = (rank) => {
53
  if (rank <= 3) {
54
  const medalStyle = MEDAL_STYLES[rank];
 
72
  marginRight: "8px",
73
  };
74
  }
75
+ // For ranks > 3, same dimensions but transparent
76
  return {
77
  color: "text.primary",
78
  fontWeight: rank <= 10 ? 600 : 400,
frontend/src/components/Evaluation/hooks/useSimulation.js CHANGED
@@ -1,8 +1,8 @@
1
  import { useState, useRef, useEffect } from "react";
2
 
3
  // Simulation time in milliseconds for pre-calculated documents
4
- const SIMULATION_DURATION = 8000; // 8 secondes au total
5
- const STEP_DURATION = SIMULATION_DURATION / 5; // Durée de chaque étape
6
 
7
  // Starting messages with their timing
8
  const STARTING_MESSAGES = [
@@ -19,15 +19,15 @@ export const useSimulation = (onComplete, shouldStart = false) => {
19
  const timeoutsRef = useRef([]);
20
  const hasInitializedRef = useRef(false);
21
 
22
- // Effet pour démarrer la simulation si shouldStart est true
23
  useEffect(() => {
24
  if (!shouldStart || hasInitializedRef.current) return;
25
 
26
- // Marquer comme initialisé
27
  hasInitializedRef.current = true;
28
  console.log("Simulation starting with shouldStart =", shouldStart);
29
 
30
- // Programmer des timeouts séquentiels pour chaque étape
31
  for (let i = 1; i < STARTING_MESSAGES.length; i++) {
32
  const timeout = setTimeout(() => {
33
  console.log(`Setting message index to ${i}`);
@@ -49,7 +49,7 @@ export const useSimulation = (onComplete, shouldStart = false) => {
49
  timeoutsRef.current.push(completeTimeout);
50
 
51
  return () => {
52
- // Nettoyer tous les timeouts lors du démontage
53
  timeoutsRef.current.forEach(clearTimeout);
54
  };
55
  }, [shouldStart, onComplete]);
 
1
  import { useState, useRef, useEffect } from "react";
2
 
3
  // Simulation time in milliseconds for pre-calculated documents
4
+ const SIMULATION_DURATION = 8000; // 8 seconds in total
5
+ const STEP_DURATION = SIMULATION_DURATION / 5; // Duration of each step
6
 
7
  // Starting messages with their timing
8
  const STARTING_MESSAGES = [
 
19
  const timeoutsRef = useRef([]);
20
  const hasInitializedRef = useRef(false);
21
 
22
+ // Effect to start simulation if shouldStart is true
23
  useEffect(() => {
24
  if (!shouldStart || hasInitializedRef.current) return;
25
 
26
+ // Mark as initialized
27
  hasInitializedRef.current = true;
28
  console.log("Simulation starting with shouldStart =", shouldStart);
29
 
30
+ // Schedule sequential timeouts for each step
31
  for (let i = 1; i < STARTING_MESSAGES.length; i++) {
32
  const timeout = setTimeout(() => {
33
  console.log(`Setting message index to ${i}`);
 
49
  timeoutsRef.current.push(completeTimeout);
50
 
51
  return () => {
52
+ // Clean up all timeouts on unmount
53
  timeoutsRef.current.forEach(clearTimeout);
54
  };
55
  }, [shouldStart, onComplete]);
frontend/src/hooks/useDevShortcuts.js CHANGED
@@ -2,10 +2,10 @@ import { useEffect } from "react";
2
  import { useNavigate } from "react-router-dom";
3
 
4
  /**
5
- * Hook pour les raccourcis clavier du mode développeur
6
  *
7
- * @param {Object} options - Options pour le hook
8
- * @param {string} options.sessionId - ID de session en cours (si disponible)
9
  * @returns {void}
10
  */
11
  const useDevShortcuts = ({ sessionId = null } = {}) => {
@@ -13,18 +13,18 @@ const useDevShortcuts = ({ sessionId = null } = {}) => {
13
 
14
  useEffect(() => {
15
  const handleKeyDown = (e) => {
16
- // Raccourci 'p' - effacer les données d'authentification et recharger
17
  if (e.key === "p") {
18
  console.log("Debug key pressed: Clearing auth data and refreshing");
19
  localStorage.removeItem("hf_oauth");
20
  localStorage.removeItem("auth_return_to");
21
- // Afficher un bref message
22
  alert("Auth data cleared. Page will reload.");
23
- // Recharger la page
24
  window.location.reload();
25
  }
26
 
27
- // Raccourci 'd' - aller directement à l'affichage du benchmark
28
  if (e.key === "d" && sessionId) {
29
  console.log("Debug key pressed: Showing BenchmarkDisplay");
30
  navigate(`/benchmark-display?session=${sessionId}`);
 
2
  import { useNavigate } from "react-router-dom";
3
 
4
  /**
5
+ * Hook for developer keyboard shortcuts
6
  *
7
+ * @param {Object} options - Options for the hook
8
+ * @param {string} options.sessionId - Current session ID (if available)
9
  * @returns {void}
10
  */
11
  const useDevShortcuts = ({ sessionId = null } = {}) => {
 
13
 
14
  useEffect(() => {
15
  const handleKeyDown = (e) => {
16
+ // Shortcut 'p' - clear authentication data and reload
17
  if (e.key === "p") {
18
  console.log("Debug key pressed: Clearing auth data and refreshing");
19
  localStorage.removeItem("hf_oauth");
20
  localStorage.removeItem("auth_return_to");
21
+ // Show a brief message
22
  alert("Auth data cleared. Page will reload.");
23
+ // Reload the page
24
  window.location.reload();
25
  }
26
 
27
+ // Shortcut 'd' - go directly to benchmark display
28
  if (e.key === "d" && sessionId) {
29
  console.log("Debug key pressed: Showing BenchmarkDisplay");
30
  navigate(`/benchmark-display?session=${sessionId}`);
frontend/src/pages/BenchmarkGenerationPage.jsx CHANGED
@@ -21,8 +21,8 @@ function BenchmarkGenerationPage() {
21
  const handleGenerationComplete = (result) => {
22
  console.log("Benchmark generation completed:", result);
23
  if (result && result.success && !hasRedirectedRef.current) {
24
- hasRedirectedRef.current = true; // Marquer que la redirection a été faite
25
- // Légère pause avant de naviguer pour éviter les problèmes de synchronisation
26
  setTimeout(() => {
27
  navigate(`/benchmark-display?session=${sessionId}`);
28
  }, 500);
 
21
  const handleGenerationComplete = (result) => {
22
  console.log("Benchmark generation completed:", result);
23
  if (result && result.success && !hasRedirectedRef.current) {
24
+ hasRedirectedRef.current = true; // Mark that redirection has been done
25
+ // Short pause before navigating to avoid synchronization issues
26
  setTimeout(() => {
27
  navigate(`/benchmark-display?session=${sessionId}`);
28
  }, 500);
frontend/src/pages/EvaluationDisplayPage.jsx CHANGED
@@ -18,14 +18,14 @@ function EvaluationDisplayPage() {
18
  const { mode } = useThemeMode();
19
  const theme = getTheme(mode);
20
 
21
- // Liste des documents de base qui ne doivent pas être supprimés
22
  const baseDocuments = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"];
23
  const isBaseDocument = baseDocuments.includes(sessionId);
24
 
25
  useEffect(() => {
26
  if (!sessionId) {
27
  console.log(
28
- "Session ID manquante pour l'affichage des résultats, redirection vers l'accueil"
29
  );
30
  setIsValidSession(false);
31
  return;
@@ -33,20 +33,20 @@ function EvaluationDisplayPage() {
33
 
34
  const fetchEvaluationResults = async () => {
35
  try {
36
- // Vérifier d'abord si la session existe
37
  const sessionCheckResponse = await fetch(
38
  `${API_CONFIG.BASE_URL}/benchmark-questions/${sessionId}`
39
  );
40
 
41
  if (!sessionCheckResponse.ok) {
42
  console.error(
43
- `Session invalide ou erreur serveur: ${sessionCheckResponse.status}`
44
  );
45
  setIsValidSession(false);
46
  return;
47
  }
48
 
49
- // Récupérer les résultats d'évaluation
50
  const evalResponse = await fetch(
51
  `${API_CONFIG.BASE_URL}/evaluation-results/${sessionId}`
52
  );
@@ -77,14 +77,14 @@ function EvaluationDisplayPage() {
77
  fetchEvaluationResults();
78
  }, [sessionId]);
79
 
80
- // Effet pour nettoyer le dossier de session après avoir affiché les résultats
81
  useEffect(() => {
82
- // Ne pas nettoyer si c'est un document de base ou si les résultats ne sont pas encore chargés
83
  if (isBaseDocument || isLoading || !evaluationResults) {
84
  return;
85
  }
86
 
87
- // Fonction pour supprimer le dossier de session
88
  const cleanupSession = async () => {
89
  try {
90
  const response = await fetch(
@@ -104,12 +104,12 @@ function EvaluationDisplayPage() {
104
  }
105
  };
106
 
107
- // Appeler la fonction après un délai pour s'assurer que l'utilisateur a eu le temps de voir les résultats
108
  const cleanupTimeout = setTimeout(() => {
109
  cleanupSession();
110
  }, 2000);
111
 
112
- // Nettoyer le timeout si le composant est démonté
113
  return () => clearTimeout(cleanupTimeout);
114
  }, [sessionId, isBaseDocument, isLoading, evaluationResults]);
115
 
 
18
  const { mode } = useThemeMode();
19
  const theme = getTheme(mode);
20
 
21
+ // List of base documents that should not be deleted
22
  const baseDocuments = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"];
23
  const isBaseDocument = baseDocuments.includes(sessionId);
24
 
25
  useEffect(() => {
26
  if (!sessionId) {
27
  console.log(
28
+ "Session ID missing for displaying results, redirecting to home"
29
  );
30
  setIsValidSession(false);
31
  return;
 
33
 
34
  const fetchEvaluationResults = async () => {
35
  try {
36
+ // First check if the session exists
37
  const sessionCheckResponse = await fetch(
38
  `${API_CONFIG.BASE_URL}/benchmark-questions/${sessionId}`
39
  );
40
 
41
  if (!sessionCheckResponse.ok) {
42
  console.error(
43
+ `Invalid session or server error: ${sessionCheckResponse.status}`
44
  );
45
  setIsValidSession(false);
46
  return;
47
  }
48
 
49
+ // Retrieve evaluation results
50
  const evalResponse = await fetch(
51
  `${API_CONFIG.BASE_URL}/evaluation-results/${sessionId}`
52
  );
 
77
  fetchEvaluationResults();
78
  }, [sessionId]);
79
 
80
+ // Effect to clean up the session folder after displaying results
81
  useEffect(() => {
82
+ // Do not clean up if it's a base document or if results are not yet loaded
83
  if (isBaseDocument || isLoading || !evaluationResults) {
84
  return;
85
  }
86
 
87
+ // Function to clean up the session folder
88
  const cleanupSession = async () => {
89
  try {
90
  const response = await fetch(
 
104
  }
105
  };
106
 
107
+ // Call the function after a delay to ensure the user has had time to see the results
108
  const cleanupTimeout = setTimeout(() => {
109
  cleanupSession();
110
  }, 2000);
111
 
112
+ // Clean up the timeout if component is unmounted
113
  return () => clearTimeout(cleanupTimeout);
114
  }, [sessionId, isBaseDocument, isLoading, evaluationResults]);
115