translate some comments in english

- backend/clean_and_restart_eval.py +10 -10
- backend/config/models_config.py +2 -2
- backend/lighteval_task/lighteval_task.py +40 -39
- backend/main.py +7 -7
- backend/routes/__init__.py +3 -3
- backend/routes/benchmark.py +15 -15
- backend/routes/evaluation.py +19 -19
- backend/routes/upload.py +11 -11
- backend/tasks/create_bench.py +17 -19
- backend/tasks/create_bench_config_file.py +12 -12
- backend/tasks/get_available_model_provider.py +42 -42
- backend/tests/check_hf_token.py +28 -28
- frontend/src/components/Evaluation/Display.jsx +3 -3
- frontend/src/components/Evaluation/hooks/useSimulation.js +6 -6
- frontend/src/hooks/useDevShortcuts.js +7 -7
- frontend/src/pages/BenchmarkGenerationPage.jsx +2 -2
- frontend/src/pages/EvaluationDisplayPage.jsx +10 -10
backend/clean_and_restart_eval.py
CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Script
+Script for relaunching LightEval evaluation with a custom timeout
 """
 import os
 import sys
@@ -9,7 +9,7 @@ import asyncio
 from pathlib import Path
 from datetime import datetime
 
-#
+# Import evaluation task
 from tasks.evaluation_task import EvaluationTask, DEFAULT_EVALUATION_TIMEOUT
 
 
@@ -27,33 +27,33 @@ async def main(session_id, dataset_name, timeout=None):
         dataset_name: Nom du dataset à évaluer
         timeout: Timeout en secondes pour chaque évaluation de modèle (utilise la valeur par défaut si None)
     """
-    #
+    # Check that session folder exists
     session_dir = Path(f"uploaded_files/{session_id}")
     if not session_dir.exists():
         log(f"Erreur: Le dossier de session {session_id} n'existe pas")
         return 1
 
-    #
+    # Display used timeout
     timeout_value = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
     log(f"Utilisation d'un timeout de {timeout_value} secondes pour l'évaluation")
 
-    #
+    # Create new evaluation task with specified timeout
     log("Initialisation d'une nouvelle tâche d'évaluation")
     evaluation_task = EvaluationTask(
         session_uid=session_id,
         dataset_name=dataset_name,
-        clean_old_results=True,  #
+        clean_old_results=True,  # Automatically clean old results
         timeout=timeout
     )
 
-    #
+    # Run evaluation
     log("Démarrage de l'évaluation...")
     await evaluation_task.run()
 
-    #
+    # Check results
     if evaluation_task.is_completed:
         log("Évaluation terminée avec succès")
-        #
+        # Sort results by accuracy
         results_sorted = sorted(evaluation_task.results, key=lambda x: x.get('accuracy', 0), reverse=True)
         log(f"Résultats: {results_sorted}")
     else:
@@ -71,6 +71,6 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
-    #
+    # Run main function asynchronously
     exit_code = asyncio.run(main(args.session_id, args.dataset_name, args.timeout))
     sys.exit(exit_code)
backend/config/models_config.py
CHANGED
@@ -29,14 +29,14 @@ DEFAULT_EVALUATION_MODELS = [
 #     "mistralai/Mistral-Small-24B-Instruct-2501",
 # ]
 
-#
+# Alternative models to use if default model is not available
 ALTERNATIVE_BENCHMARK_MODELS = [
     "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
     "meta-llama/Llama-3.3-70B-Instruct",
     "meta-llama/Llama-3.1-8B-Instruct",
     "Qwen/Qwen2.5-72B-Instruct",
     "mistralai/Mistral-Small-24B-Instruct-2501",
-    #
+    # Open-source models that can work without authentication
     "HuggingFaceH4/zephyr-7b-beta",
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     "microsoft/phi-2",
backend/lighteval_task/lighteval_task.py
CHANGED
@@ -143,49 +143,49 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
 
 
 def process_judge_response_yourbench(response):
-    #
-    # logger.info(f"
+    # Add detailed logs to understand response structure
+    # logger.info(f"Response type: {type(response)}")
 
-    #
+    # If response is a dictionary, extract content
     if isinstance(response, dict):
-        # logger.info(f"
+        # logger.info(f"Dictionary keys: {response.keys()}")
         if "content" in response:
             response = response["content"]
-            # logger.info(f"
+            # logger.info(f"Content of 'content' key: {response[:100]}...")
         elif "text" in response:
             response = response["text"]
-            # logger.info(f"
+            # logger.info(f"Content of 'text' key: {response[:100]}...")
         elif "response" in response:
             response = response["response"]
-            # logger.info(f"
+            # logger.info(f"Content of 'response' key: {response[:100]}...")
         else:
-            #
+            # If no text field is found, take the first value
             response = str(list(response.values())[0])
-            # logger.info(f"
+            # logger.info(f"Using first value: {response[:100]}...")
 
-    #
+    # If response is a list, take first element
     if isinstance(response, list):
-        # logger.info(f"
+        # logger.info(f"Response is a list of length {len(response)}")
        if len(response) > 0:
            if isinstance(response[0], dict) and "content" in response[0]:
                response = response[0]["content"]
-                # logger.info(f"
+                # logger.info(f"Using content of first element: {response[:100]}...")
            else:
                response = str(response[0])
-                # logger.info(f"
+                # logger.info(f"Using first element (converted to string): {response[:100]}...")
 
-    #
-    # logger.info(f"
+    # For debugging, log current response
+    # logger.info(f"Response after initial processing: {str(response)[:200]}...")
 
-    #
+    # Simplified approach: if we have a response, we'll analyze it to determine 0 or 1
    try:
-        #
-        #
+        # For simplicity, use an approach based on keyword matching
+        # always consider the response correct unless it contains clear negative indications
 
-        #
+        # Convert to string to be sure
        response_str = str(response).lower()
 
-        #
+        # Strong negative expressions
        negative_patterns = [
            r"\bincorrect\b",
            r"\bwrong\b",
@@ -198,21 +198,22 @@ def process_judge_response_yourbench(response):
            r"\b0\b"
        ]
 
-        #
+        # Check if there are negative patterns
        for pattern in negative_patterns:
            if re.search(pattern, response_str):
-                # logger.info(f"
+                # logger.info(f"Negative pattern found: {pattern} in response")
                return 0
 
-        #
-        # logger.info("
+        # If we haven't found a negative pattern, consider the response correct
+        # logger.info("No negative pattern found, response considered correct")
        return 1
 
    except Exception as e:
+        # logger.exception("Error details:")
        # logger.error(f"Error processing judge response: {e}")
        # logger.error(f"Response type: {type(response)}")
        # logger.error(f"Response content (truncated): {str(response)[:500]}")
-        return 0  #
+        return 0  # Return 0 by default in case of error
 
 
 class JudgeLLMYourBench(JudgeLLM):
@@ -226,7 +227,7 @@ class JudgeLLMYourBench(JudgeLLM):
    )
 
    def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
-        #
+        # Add debugging to see complete data structure
        # logger.info(f"Nombre de sample_ids: {len(sample_ids)}")
        # logger.info(f"Nombre de responses: {len(responses)}")
        # logger.info(f"Nombre de formatted_docs: {len(formatted_docs)}")
@@ -244,37 +245,37 @@ class JudgeLLMYourBench(JudgeLLM):
            if "chunks" in doc.specific and doc.specific["chunks"] and len(doc.specific["chunks"]) > 0:
                chunks.append(doc.specific["chunks"][0])
            else:
-                #
+                # Use default value when chunks is absent or empty
                chunks.append("")
 
        documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
 
-        #
+        # Add logs for debugging
        # logger.info(f"Questions: {questions}")
        # logger.info(f"Predictions: {predictions}")
        # logger.info(f"Golds: {golds}")
 
-        #
-        #
-        #
+        # Instead of using the judge, which seems to have issues,
+        # Use a simplified approach based on the presence of key elements
+        # from the reference response in the model's response
        scores = []
        for i in range(len(questions)):
            prediction = str(predictions[i]).lower()
            gold = str(golds[i]).lower()
 
-            #
+            # Extract keywords from reference response (words longer than 4 letters)
            key_terms = [word for word in gold.split() if len(word) > 4]
 
-            #
+            # Calculate proportion of keywords present in model response
            matches = sum(1 for term in key_terms if term in prediction)
            coverage = matches / len(key_terms) if key_terms else 0
 
-            #
+            # Consider response correct if it covers at least 40% of keywords
            # C'est moins strict que les 60% initiaux, mais plus strict que 0%
            score = 1.0 if coverage >= 0.4 else 0.0
 
-            # logger.info(f"
-            # logger.info(f"
+            # logger.info(f"Keyword coverage for question {i+1}: {coverage:.2f} ({matches}/{len(key_terms)})")
+            # logger.info(f"Assigned score: {score}")
 
            scores.append(score)
 
@@ -292,9 +293,9 @@ class JudgeLLMYourBench(JudgeLLM):
 
        except Exception as e:
            # logger.error(f"Erreur dans la fonction compute: {str(e)}")
-            # logger.exception("
+            # logger.exception("Error details:")
 
-            #
+            # Return default result in case of error
            return [{"accuracy": 0.0} for _ in sample_ids]
 
 
@@ -350,7 +351,7 @@ def create_yourbench_task(hf_dataset_name, subset="lighteval_single_shot_questio
    try:
        extend_enum(Metrics, "accuracy", yourbench_metrics)
    except Exception:
-        #
+        # Enum may have already been added, ignore error
        pass
 
    return LightevalTaskConfig(
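Note on the scoring change above: the LLM judge is replaced by a keyword-coverage heuristic in which the gold answer's words longer than four characters are the key terms, and a prediction scores 1.0 once at least 40% of them appear in it. A minimal, self-contained sketch of that heuristic (the function name and example values are illustrative, not taken from the repository):

    def keyword_coverage_score(prediction: str, gold: str, threshold: float = 0.4) -> float:
        """Score 1.0 if the prediction contains at least `threshold` of the gold answer's key terms."""
        prediction = prediction.lower()
        # Key terms are the gold-answer words longer than 4 characters, as in the diff above
        key_terms = [word for word in gold.lower().split() if len(word) > 4]
        if not key_terms:
            return 0.0
        matches = sum(1 for term in key_terms if term in prediction)
        return 1.0 if matches / len(key_terms) >= threshold else 0.0

    # "capital" and "paris" are found, "france" is not: coverage 2/3 >= 0.4, so the score is 1.0
    print(keyword_coverage_score("The capital is Paris.", "Paris is the capital of France"))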
backend/main.py
CHANGED
@@ -24,22 +24,22 @@ else:
 
 app = FastAPI(title="Yourbench API")
 
-#
+# Enable CORS to allow requests from frontend
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  #
+    allow_origins=["*"],  # In a production environment, specify exact origins
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 
-#
+# Add an event handler to display session_files at startup
 @app.on_event("startup")
 async def startup_event():
     print("\n===== Application Startup at", datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
     print(f"Initial session_files: {session_files}")
 
-    #
+    # Display detailed information about environment variables
     print("\n===== Environment Variables Check =====")
     hf_token = os.environ.get("HF_TOKEN")
     if hf_token:
@@ -61,14 +61,14 @@ async def startup_event():
         print("   This may affect billing and access to certain models.")
 
     print("\n===== Additional Environment Variables =====")
-    #
+    # Display other useful variables
     for env_var in ["PORT", "DEBUG", "PYTHONPATH", "VIRTUAL_ENV"]:
         value = os.environ.get(env_var)
         if value:
             print(f"ℹ️ {env_var}: {value}")
     print("=======================================\n")
 
-    #
+    # Test models at startup and display results
     print("===== Testing model availability at startup =====")
     test_results = test_models(verbose=True)
     print("===== Model testing completed =====")
@@ -82,6 +82,6 @@ async def startup_event():
     print("3. Try again later as the API service might be temporarily unavailable")
     print("4. Configure alternative models in config/models_config.py")
 
-#
+# Register all routes
 for router in routers:
     app.include_router(router)
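The new comment on allow_origins points out that "*" is only suitable for the demo. A minimal sketch of the stricter production setup it alludes to, using FastAPI's CORSMiddleware with an explicit origin list (the origin URL is a placeholder, not a value from the repository):

    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware

    app = FastAPI(title="Yourbench API")

    # Restrict CORS to the known frontend origins instead of allowing every origin
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["https://your-frontend.example.org"],  # placeholder origin
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )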
backend/routes/__init__.py
CHANGED
@@ -7,7 +7,7 @@ from .download import router as download_router
 from .evaluation import router as evaluation_router, active_evaluation_tasks
 from .cleanup import router as cleanup_router
 
-#
+# Expose the routers
 routers = [
     health_router,
     upload_router,
@@ -18,9 +18,9 @@ routers = [
     cleanup_router
 ]
 
-#
+# Reference shared data between routes
 benchmark_router.session_files = session_files
 cleanup_router.session_files = session_files
 
-#
+# Expose shared variables for main.py
 __all__ = ['routers', 'session_files', 'active_tasks', 'active_evaluation_tasks']
backend/routes/benchmark.py
CHANGED
@@ -7,11 +7,11 @@ from tasks.create_bench import CreateBenchTask
 
 router = APIRouter(tags=["benchmark"])
 
-# Store active tasks by session_id (
+# Store active tasks by session_id (imported in main.py)
 active_tasks = {}
 
-#
-#
+# Reference to session_files (will be provided by main.py)
+# This declaration will be overwritten by assignment in __init__.py
 session_files = {}
 
 @router.post("/generate-benchmark")
@@ -34,17 +34,17 @@ async def generate_benchmark(data: Dict[str, Any]):
     if not session_id or session_id not in router.session_files:
         return {"error": "Invalid or missing session ID"}
 
-    #
+    # Check if a benchmark is already in progress or completed for this session
     if session_id in active_tasks:
         task = active_tasks[session_id]
-        #
+        # If the benchmark is already completed, return existing logs
         if task.is_task_completed():
             return {
                 "status": "already_completed",
                 "logs": task.get_logs(),
                 "is_completed": True
             }
-        #
+        # If the benchmark is running, return current logs
         else:
             return {
                 "status": "already_running",
@@ -56,16 +56,16 @@ async def generate_benchmark(data: Dict[str, Any]):
     all_logs = []
 
     try:
-        #
+        # Initialize the task that will handle the entire process
         task = UnifiedBenchmarkTask(session_uid=session_id)
 
-        #
+        # Storage for later log retrieval
         active_tasks[session_id] = task
 
-        #
+        # Start the benchmark process
         task.run(file_path)
 
-        #
+        # Get initial logs
         all_logs = task.get_logs()
 
         return {
@@ -102,7 +102,7 @@ async def get_benchmark_progress(session_id: str):
         "is_completed": is_completed
     }
 
-#
+# Create a class that unifies the benchmark process
 class UnifiedBenchmarkTask:
     """
     Task that handles the entire benchmark process from configuration to completion
@@ -217,8 +217,8 @@ class UnifiedBenchmarkTask:
             # Mark as completed
             self.is_completed = True
 
-            #
-            #
+            # Check if an error was detected in the benchmark logs
+            # Specifically ignore JSON parsing errors that should not block the process
             has_error = any("[ERROR]" in log and not ("JSONDecodeError" in log or
                             "Error processing QA pair" in log or
                             "'str' object has no attribute 'get'" in log)
@@ -226,7 +226,7 @@ class UnifiedBenchmarkTask:
             benchmark_terminated_with_error = any("Benchmark process terminated with error code" in log for log in final_logs)
             benchmark_already_marked_success = any("Benchmark process completed successfully" in log for log in final_logs)
 
-            #
+            # Even if there are JSON errors, consider the benchmark successful
             json_errors_only = any(("JSONDecodeError" in log or
                                     "Error processing QA pair" in log or
                                     "'str' object has no attribute 'get'" in log)
@@ -235,7 +235,7 @@ class UnifiedBenchmarkTask:
             if json_errors_only:
                 self._add_log("[INFO] Benchmark completed with minor JSON parsing warnings, considered successful")
 
-            #
+            # Only add success message if no serious errors were detected
             if (not has_error and not benchmark_terminated_with_error and not benchmark_already_marked_success) or json_errors_only:
                 self._add_log("[SUCCESS] Benchmark process completed successfully")
backend/routes/evaluation.py
CHANGED
@@ -18,55 +18,55 @@ async def evaluate_benchmark(data: Dict[str, Any]):
     Lancer l'évaluation d'un benchmark pour une session donnée
 
     Args:
-        data: Dictionary
+        data: Dictionary containing session_id
 
     Returns:
-        Dictionary
+        Dictionary with status and initial logs
     """
     session_id = data.get("session_id")
 
     if not session_id:
-        return {"error": "Session ID
+        return {"error": "Session ID missing or invalid"}
 
-    #
+    # Check if an evaluation is already in progress for this session
     if session_id in active_evaluation_tasks:
         evaluation_task = active_evaluation_tasks[session_id]
-        #
+        # If the evaluation is already completed, we can start a new one
         if evaluation_task.is_task_completed():
-            #
+            # Delete the old task
             del active_evaluation_tasks[session_id]
         else:
-            #
+            # An evaluation is already in progress
             return {
                 "status": "already_running",
-                "message": "
+                "message": "An evaluation is already in progress for this session",
                 "logs": evaluation_task.get_logs()
             }
 
     try:
-        #
+        # Dataset name based on session ID
         dataset_name = f"yourbench/yourbench_{session_id}"
 
-        #
+        # Create and start a new evaluation task
         evaluation_task = EvaluationTask(session_uid=session_id, dataset_name=dataset_name)
         active_evaluation_tasks[session_id] = evaluation_task
 
-        #
+        # Start the evaluation asynchronously
        asyncio.create_task(evaluation_task.run())
 
-        #
+        # Get initial logs
        initial_logs = evaluation_task.get_logs()
 
        return {
            "status": "started",
-            "message": f"
+            "message": f"Evaluation started for benchmark {dataset_name}",
            "logs": initial_logs
        }
    except Exception as e:
        return {
            "status": "error",
            "error": str(e),
-            "message": f"
+            "message": f"Error starting evaluation: {str(e)}"
        }
 
@router.get("/evaluation-logs/{session_id}")
@@ -87,12 +87,12 @@ async def get_evaluation_logs(session_id: str):
    logs = evaluation_task.get_logs()
    is_completed = evaluation_task.is_task_completed()
 
-    #
+    # Get results if available and evaluation is completed
    results = None
    if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
        results = evaluation_task.results
 
-    #
+    # Get step information
    progress = evaluation_task.get_progress()
 
    return {
@@ -130,13 +130,13 @@ async def get_evaluation_results(session_id: str):
    with open(results_file) as f:
        results_data = json.load(f)
 
-    #
+    # Check if results are in the new format or old format
    if "results" in results_data and isinstance(results_data["results"], list):
-        #
+        # New format: { "metadata": ..., "results": [...] }
        results_list = results_data["results"]
        metadata = results_data.get("metadata", {})
    else:
-        #
+        # Old format: [...] (list directly)
        results_list = results_data
        metadata = {}
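The last hunk above distinguishes two result-file layouts. A small helper sketch that normalizes both into the same (results_list, metadata) pair (the function name is illustrative, not part of the repository):

    import json
    from typing import Any

    def load_results(path: str) -> tuple[list[Any], dict]:
        """Return (results_list, metadata) for both layouts described in the diff."""
        with open(path) as f:
            data = json.load(f)
        if isinstance(data, dict) and "results" in data and isinstance(data["results"], list):
            # New format: {"metadata": {...}, "results": [...]}
            return data["results"], data.get("metadata", {})
        # Old format: the file is the results list itself
        return data, {}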
backend/routes/upload.py
CHANGED
@@ -24,12 +24,12 @@ os.makedirs(UPLOAD_ROOT, exist_ok=True)
 # Minimum length for any file (in characters)
 MIN_FILE_LENGTH = 500
 
-#
-MAX_CONTENT_SIZE = 5 * 1024 * 1024  # 5 MB max
-REQUEST_TIMEOUT = 10  # Timeout
-#
+# Security limits configuration
+MAX_CONTENT_SIZE = 5 * 1024 * 1024  # 5 MB max for uploaded content
+REQUEST_TIMEOUT = 10  # Timeout for HTTP requests
+# List of allowed domains (empty = all allowed, but should be filled in production)
 ALLOWED_DOMAINS: List[str] = []
-#
+# List of file extensions to block in URLs
 BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
 
 def validate_pdf(file_path: str) -> bool:
@@ -230,17 +230,17 @@ async def upload_url(url: str = Form(...)):
         Dictionary with status and session_id
     """
     try:
-        #
+        # Validate that the URL is well-formed
         if not validators.url(url):
             raise HTTPException(status_code=400, detail="Invalid URL format")
 
-        #
+        # Check if URL has a blocked extension
         parsed_url = urlparse(url)
         path = parsed_url.path.lower()
         if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
             raise HTTPException(status_code=400, detail="This file type is not allowed")
 
-        #
+        # Check if domain is allowed (if list is not empty)
         domain = parsed_url.netloc
         if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
             raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
@@ -256,11 +256,11 @@ async def upload_url(url: str = Form(...)):
             url,
             timeout=REQUEST_TIMEOUT,
             headers=headers,
-            stream=True  #
+            stream=True  # To check size before downloading all content
         )
         response.raise_for_status()
 
-        #
+        # Check Content-Type
         content_type = response.headers.get('Content-Type', '')
         if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
             raise HTTPException(
@@ -268,7 +268,7 @@ async def upload_url(url: str = Form(...)):
                 detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
             )
 
-        #
+        # Check content size
         content_length = int(response.headers.get('Content-Length', 0))
         if content_length > MAX_CONTENT_SIZE:
             raise HTTPException(
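The stream=True comment explains why the request is streamed: the size should be checked before the whole body is downloaded. Since the hunk above only inspects the Content-Length header, which a server may omit or misreport, here is a hedged sketch of how the cap could also be enforced while reading the stream (the helper name is illustrative, not part of the repository):

    import requests

    MAX_CONTENT_SIZE = 5 * 1024 * 1024  # 5 MB, matching the limit in the diff

    def fetch_with_size_limit(url: str, timeout: int = 10) -> bytes:
        """Download a URL but abort as soon as the configured size limit is exceeded."""
        with requests.get(url, timeout=timeout, stream=True) as response:
            response.raise_for_status()
            if int(response.headers.get("Content-Length", 0)) > MAX_CONTENT_SIZE:
                raise ValueError("Declared content length exceeds the limit")
            chunks, total = [], 0
            for chunk in response.iter_content(chunk_size=8192):
                total += len(chunk)
                if total > MAX_CONTENT_SIZE:
                    raise ValueError("Downloaded content exceeds the limit")
                chunks.append(chunk)
            return b"".join(chunks)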
backend/tasks/create_bench.py
CHANGED
@@ -107,9 +107,9 @@ class CreateBenchTask:
         """
         self._add_log("[INFO] Starting output capture")
 
-        # Flag
+        # Flag to detect rate limiting errors
         rate_limit_detected = False
-        # Flag
+        # Flag to detect non-critical JSON errors
         json_errors_detected = False
 
         try:
@@ -135,14 +135,14 @@ class CreateBenchTask:
                     rate_limit_detected = True
                     self._add_log("[ERROR] RATE_LIMIT_EXCEEDED: The demo is under heavy load at the moment.")
 
-                #
+                # Detect non-critical JSON errors
                 if ("JSONDecodeError" in line or
                     "Error processing QA pair" in line or
                     "'str' object has no attribute 'get'" in line):
                     json_errors_detected = True
-                    #
+                    # Do not mark them as errors but as warnings
                     self._add_log(f"[WARN] Non-critical JSON error: {line}")
-                    continue  #
+                    continue  # Skip to next line
 
                 # Log raw output for debugging
                 self._add_log(f"[DEBUG] Raw output: {line}")
@@ -157,12 +157,12 @@ class CreateBenchTask:
                 else:
                     # Detect completed stages
                     if "Completed stage:" in line:
-                        #
+                        # Extract step name
                         stage = line.split("'")[1] if "'" in line else line.split("Completed stage:")[1].strip()
-                        #
+                        # Standardize step names to match frontend
                         stage = self._standardize_stage_name(stage)
                         self._add_log(f"[SUCCESS] Stage completed: {stage}")
-                    #
+                    # Specifically check completion of upload_ingest_to_hub step
                     elif "Successfully completed 'upload_ingest_to_hub' stage" in line:
                         self._add_log(f"[SUCCESS] Stage completed: upload_ingest_to_hub")
                     else:
@@ -172,22 +172,20 @@ class CreateBenchTask:
             if self.process:
                 exit_code = self.process.poll()
                 if exit_code == 0 or json_errors_detected:
-                    #
+                    # Consider process successful even with JSON errors
                     if json_errors_detected:
                         self._add_log("[INFO] Benchmark completed with non-critical JSON errors, considered successful")
                     else:
                         self._add_log("[SUCCESS] Benchmark process completed successfully")
                 else:
-                    #
+                    # If a rate limiting error was detected, display a specific message
                     if rate_limit_detected:
                         self._add_log("[ERROR] Benchmark process failed due to API rate limiting. The demo is under heavy load at the moment.")
-                    #
-                    # self._add_log(f"[ERROR] Benchmark process terminated with error code: {exit_code}")
-                    # Message informatif sur la fin du processus avec erreurs
+                    # Do not add success message in case of exception
                     self._add_log("[INFO] Benchmark process completed with errors")
         except Exception as e:
             self._add_log(f"[ERROR] Error during output capture: {str(e)}")
-            #
+            # Do not add success message in case of exception
         finally:
             self.is_completed = True
             self.is_running_flag.clear()
@@ -203,10 +201,10 @@ class CreateBenchTask:
         Returns:
             Standardized stage name
         """
-        #
+        # Mapping table for step names
+        # Add necessary mappings here
+        # example: "original_name": "standardized_name"
         stage_mapping = {
-            # Ajouter ici les correspondances nécessaires
-            # exemple: "original_name": "standardized_name"
             "ingest": "ingestion",
             "upload": "upload_ingest_to_hub",
             "summarize": "summarization",
@@ -214,12 +212,12 @@ class CreateBenchTask:
             "generate_questions": "single_shot_question_generation",
         }
 
-        #
+        # Look for partial matches
         for key, value in stage_mapping.items():
             if key in stage_name.lower():
                 return value
 
-        #
+        # If no match is found, return original name
         return stage_name
 
     def run(self, token: Optional[str] = None) -> None:
backend/tasks/create_bench_config_file.py
CHANGED
@@ -65,7 +65,7 @@ class CreateBenchConfigTask:
         Returns:
             List of log messages
         """
-        return self.logs.copy()  #
+        return self.logs.copy()  # Return a copy to avoid reference problems
 
     def save_uploaded_file(self, file_path: str) -> str:
         """
@@ -99,27 +99,27 @@ class CreateBenchConfigTask:
         """
         self._add_log(f"[INFO] Finding available provider for {model_name}")
 
-        #
+        # Try to find a provider for the model
         provider = get_available_model_provider(model_name, verbose=True)
 
         if provider:
             self._add_log(f"[INFO] Found provider for {model_name}: {provider}")
             return provider
 
-        #
-        #
+        # If no provider is found with the preferred configuration
+        # Let's try to find any available provider by ignoring the preference
         from huggingface_hub import model_info
         from tasks.get_available_model_provider import test_provider
 
         self._add_log(f"[WARNING] No preferred provider found for {model_name}, trying all available providers...")
 
         try:
-            #
+            # Get all possible providers for this model
             info = model_info(model_name, expand="inferenceProviderMapping")
             if hasattr(info, "inference_provider_mapping"):
                 providers = list(info.inference_provider_mapping.keys())
 
-                #
+                # Exclude preferred providers already tested
                 other_providers = [p for p in providers if p not in PREFERRED_PROVIDERS]
 
                 if other_providers:
@@ -158,13 +158,13 @@ class CreateBenchConfigTask:
         # Get provider for the default model
         provider = self.get_model_provider(DEFAULT_BENCHMARK_MODEL)
 
-        #
+        # If no provider is found for the default model, try alternative models
         selected_model = DEFAULT_BENCHMARK_MODEL
 
         if not provider:
             self._add_log(f"[WARNING] Primary model {DEFAULT_BENCHMARK_MODEL} not available. Trying alternatives...")
 
-            #
+            # Use the list of alternative models from configuration
             for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
                 self._add_log(f"[INFO] Trying alternative model: {alt_model}")
                 alt_provider = self.get_model_provider(alt_model)
@@ -174,7 +174,7 @@ class CreateBenchConfigTask:
                     provider = alt_provider
                     break
 
-        #
+        # If still no provider, raise an exception
         if not provider:
             error_msg = "No model with available provider found. Cannot proceed with benchmark."
             self._add_log(f"[ERROR] {error_msg}")
@@ -189,11 +189,11 @@ class CreateBenchConfigTask:
             "max_concurrent_requests": 32,
         }]
 
-        #
+        # Update model roles if an alternative model is used
         model_roles = dict(BENCHMARK_MODEL_ROLES)
         if selected_model != DEFAULT_BENCHMARK_MODEL:
             for role in model_roles:
-                if role != "chunking":  #
+                if role != "chunking":  # Do not change the chunking model
                     model_roles[role] = [selected_model]
 
             self._add_log(f"[INFO] Updated model roles to use {selected_model}")
@@ -351,7 +351,7 @@ class CreateBenchConfigTask:
         # time.sleep(2)  # Simulate delay
         self._add_log("[SUCCESS] Stage completed: config_generation")
 
-        #
+        # Task completed
         self.mark_task_completed()
 
         return str(config_path)
backend/tasks/get_available_model_provider.py
CHANGED
@@ -38,7 +38,7 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
|
|
38 |
if verbose:
|
39 |
logger.warning("No HF_TOKEN found in environment variables. This will likely cause authentication failures.")
|
40 |
print("WARNING: HF_TOKEN is missing. Most model providers require valid authentication.")
|
41 |
-
#
|
42 |
return _test_provider_without_token(model_name, provider, verbose)
|
43 |
|
44 |
# Get HF organization from environment
|
@@ -82,7 +82,7 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
|
|
82 |
elif "status_code=401" in error_message or "status_code=403" in error_message:
|
83 |
logger.warning(f"Authentication failed for provider {provider}. Your HF_TOKEN may be invalid or expired.")
|
84 |
print(f"Authentication error with provider {provider}. Please check your HF_TOKEN.")
|
85 |
-
#
|
86 |
if verbose:
|
87 |
logger.info(f"Trying provider {provider} without authentication")
|
88 |
return _test_provider_without_token(model_name, provider, verbose)
|
@@ -93,7 +93,7 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
|
|
93 |
return False
|
94 |
except Exception as auth_error:
|
95 |
if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
|
96 |
-
#
|
97 |
if verbose:
|
98 |
logger.warning(f"Authentication error with {provider}: {str(auth_error)}. Your HF_TOKEN may be invalid.")
|
99 |
print(f"Authentication error detected. Please verify your HF_TOKEN is valid and has appropriate permissions.")
|
@@ -110,15 +110,15 @@ def test_provider(model_name: str, provider: str, verbose: bool = False) -> bool
|
|
110 |
|
111 |
def _test_provider_without_token(model_name: str, provider: str, verbose: bool = False) -> bool:
|
112 |
"""
|
113 |
-
|
114 |
|
115 |
Args:
|
116 |
-
model_name:
|
117 |
-
provider: Provider
|
118 |
-
verbose:
|
119 |
|
120 |
Returns:
|
121 |
-
True
|
122 |
"""
|
123 |
try:
|
124 |
if verbose:
|
@@ -175,48 +175,48 @@ def get_available_model_provider(model_name, verbose=False):
|
|
175 |
# Get providers for the model and prioritize them
|
176 |
info = None
|
177 |
try:
|
178 |
-
#
|
179 |
try:
|
180 |
if verbose:
|
181 |
logger.info(f"Trying to get model info for {model_name} with auth token")
|
182 |
info = model_info(model_name, token=hf_token, expand="inferenceProviderMapping")
|
183 |
except Exception as auth_error:
|
184 |
-
#
|
185 |
if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
|
186 |
if verbose:
|
187 |
logger.warning(f"Authentication failed for {model_name}, trying without token")
|
188 |
-
#
|
189 |
try:
|
190 |
info = model_info(model_name, expand="inferenceProviderMapping")
|
191 |
except Exception as e:
|
192 |
if verbose:
|
193 |
logger.error(f"Failed to get model info without token: {str(e)}")
|
194 |
-
#
|
195 |
if verbose:
|
196 |
logger.warning(f"Using default providers list as fallback for {model_name}")
|
197 |
-
#
|
198 |
return _test_fallback_providers(model_name, verbose)
|
199 |
else:
|
200 |
-
#
|
201 |
raise auth_error
|
202 |
|
203 |
if not info or not hasattr(info, "inference_provider_mapping"):
|
204 |
if verbose:
|
205 |
logger.info(f"No inference providers found for {model_name}")
|
206 |
-
#
|
207 |
return _test_fallback_providers(model_name, verbose)
|
208 |
|
209 |
providers = list(info.inference_provider_mapping.keys())
|
210 |
if not providers:
|
211 |
if verbose:
|
212 |
logger.info(f"Empty list of providers for {model_name}")
|
213 |
-
#
|
214 |
return _test_fallback_providers(model_name, verbose)
|
215 |
|
216 |
except Exception as e:
|
217 |
if verbose:
|
218 |
logger.error(f"Error retrieving model info for {model_name}: {str(e)}")
|
219 |
-
#
|
220 |
return _test_fallback_providers(model_name, verbose)
|
221 |
|
222 |
# Prioritize providers
|
@@ -277,22 +277,22 @@ def get_available_model_provider(model_name, verbose=False):
|
|
277 |
|
278 |
def _test_fallback_providers(model_name, verbose=False):
|
279 |
"""
|
280 |
-
|
281 |
|
282 |
Args:
|
283 |
-
model_name:
|
284 |
-
verbose:
|
285 |
|
286 |
Returns:
|
287 |
-
|
288 |
"""
|
289 |
-
#
|
290 |
default_providers = ["huggingface", "sambanova", "novita", "fireworks-ai", "together", "openai", "anthropic"]
|
291 |
|
292 |
if verbose:
|
293 |
logger.warning(f"Using fallback providers list for {model_name}: {', '.join(default_providers)}")
|
294 |
|
295 |
-
#
|
296 |
for provider in default_providers:
|
297 |
if verbose:
|
298 |
logger.info(f"Testing fallback provider {provider} for {model_name}")
|
@@ -309,13 +309,13 @@ def _test_fallback_providers(model_name, verbose=False):
|
|
309 |
|
310 |
def test_models(verbose=True):
|
311 |
"""
|
312 |
-
Test
|
313 |
|
314 |
Args:
|
315 |
-
verbose:
|
316 |
|
317 |
Returns:
|
318 |
-
|
319 |
"""
|
320 |
results = {
|
321 |
"default_model": None,
|
@@ -327,22 +327,22 @@ def test_models(verbose=True):
|
|
327 |
}
|
328 |
|
329 |
print("\n===== Checking HuggingFace Authentication =====")
|
330 |
-
#
|
331 |
hf_token = os.environ.get("HF_TOKEN")
|
332 |
if hf_token:
|
333 |
print("✅ HF_TOKEN is available")
|
334 |
|
335 |
-
#
|
336 |
if not hf_token.startswith("hf_"):
|
337 |
print("⚠️ WARNING: Your HF_TOKEN does not start with 'hf_' which is unusual. Please verify its format.")
|
338 |
|
339 |
-
#
|
340 |
masked_token = "••••••••••"
|
341 |
|
342 |
-
#
|
343 |
import requests
|
344 |
try:
|
345 |
-
# Test
|
346 |
test_model = "gpt2"
|
347 |
api_url = f"https://api-inference.huggingface.co/models/{test_model}"
|
348 |
|
@@ -353,13 +353,13 @@ def test_models(verbose=True):
|
|
353 |
|
354 |
response = requests.post(api_url, headers=headers, json=payload, timeout=10)
|
355 |
|
356 |
-
if response.status_code in [200, 503]: # 503 =
|
357 |
print(f"✅ HF_TOKEN validated - Token accepted by the inference API! Status: {response.status_code}")
|
358 |
if response.status_code == 503:
|
359 |
print("ℹ️ Model is loading, but token is valid")
|
360 |
|
361 |
-
#
|
362 |
-
#
|
363 |
try:
|
364 |
whoami_response = requests.get(
|
365 |
"https://huggingface.co/api/whoami",
|
@@ -370,13 +370,13 @@ def test_models(verbose=True):
|
|
370 |
user_info = whoami_response.json()
|
371 |
print(f"✅ Additional info - Authenticated as: {user_info.get('name', 'Unknown user')}")
|
372 |
|
373 |
-
#
|
374 |
if user_info.get('canPay', False):
|
375 |
print("✅ Your account has payment methods configured - you may have access to premium models")
|
376 |
else:
|
377 |
print("ℹ️ Your account does not have payment methods configured - access to premium models may be limited")
|
378 |
except Exception:
|
379 |
-
#
|
380 |
pass
|
381 |
else:
|
382 |
print(f"❌ HF_TOKEN validation failed with status code: {response.status_code}")
|
@@ -391,7 +391,7 @@ def test_models(verbose=True):
|
|
391 |
|
392 |
print("⚠️ Most model providers will not work with invalid credentials")
|
393 |
|
394 |
-
#
|
395 |
try:
|
396 |
print("Attempting alternative validation with status endpoint...")
|
397 |
status_url = "https://api-inference.huggingface.co/status"
|
@@ -409,7 +409,7 @@ def test_models(verbose=True):
|
|
409 |
print("❌ HF_TOKEN is missing - authentication to HuggingFace API will fail")
|
410 |
print("⚠️ Most models and providers require authentication")
|
411 |
|
412 |
-
#
|
413 |
hf_organization = os.environ.get("HF_ORGANIZATION")
|
414 |
if hf_organization:
|
415 |
print(f"✅ HF_ORGANIZATION is available: {hf_organization}")
|
@@ -419,7 +419,7 @@ def test_models(verbose=True):
|
|
419 |
if verbose:
|
420 |
print(f"\n===== Testing main default model: {DEFAULT_BENCHMARK_MODEL} =====")
|
421 |
|
422 |
-
# Test
|
423 |
provider = get_available_model_provider(DEFAULT_BENCHMARK_MODEL, verbose=verbose)
|
424 |
|
425 |
if provider:
|
@@ -433,7 +433,7 @@ def test_models(verbose=True):
|
|
433 |
print(f"\n❌ DEFAULT MODEL FAILED: No provider found for {DEFAULT_BENCHMARK_MODEL}")
|
434 |
print("Trying alternative models...")
|
435 |
|
436 |
-
#
|
437 |
for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
|
438 |
if verbose:
|
439 |
print(f"\nTrying alternative model: {alt_model}")
|
@@ -452,7 +452,7 @@ def test_models(verbose=True):
|
|
452 |
print("\n⚠️ This is likely due to authentication issues with your HF_TOKEN")
|
453 |
print("⚠️ Please check your token or try using models that don't require authentication")
|
454 |
|
455 |
-
#
|
456 |
models = [
|
457 |
"Qwen/QwQ-32B",
|
458 |
"Qwen/Qwen2.5-72B-Instruct",
|
@@ -495,5 +495,5 @@ def test_models(verbose=True):
|
|
495 |
return results
|
496 |
|
497 |
if __name__ == "__main__":
|
498 |
-
#
|
499 |
test_results = test_models(verbose=True)
|
|
|
38 |
if verbose:
|
39 |
logger.warning("No HF_TOKEN found in environment variables. This will likely cause authentication failures.")
|
40 |
print("WARNING: HF_TOKEN is missing. Most model providers require valid authentication.")
|
41 |
+
# Try without token (for providers that accept anonymous requests)
|
42 |
return _test_provider_without_token(model_name, provider, verbose)
|
43 |
|
44 |
# Get HF organization from environment
|
|
|
82 |
elif "status_code=401" in error_message or "status_code=403" in error_message:
|
83 |
logger.warning(f"Authentication failed for provider {provider}. Your HF_TOKEN may be invalid or expired.")
|
84 |
print(f"Authentication error with provider {provider}. Please check your HF_TOKEN.")
|
85 |
+
# If authentication fails, try without token (for public models)
|
86 |
if verbose:
|
87 |
logger.info(f"Trying provider {provider} without authentication")
|
88 |
return _test_provider_without_token(model_name, provider, verbose)
|
|
|
93 |
return False
|
94 |
except Exception as auth_error:
|
95 |
if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
|
96 |
+
# If authentication fails, try without token (for public models)
|
97 |
if verbose:
|
98 |
logger.warning(f"Authentication error with {provider}: {str(auth_error)}. Your HF_TOKEN may be invalid.")
|
99 |
print(f"Authentication error detected. Please verify your HF_TOKEN is valid and has appropriate permissions.")
|
|
|
110 |
|
111 |
def _test_provider_without_token(model_name: str, provider: str, verbose: bool = False) -> bool:
|
112 |
"""
|
113 |
+
Try to test a provider without an authentication token
|
114 |
|
115 |
Args:
|
116 |
+
model_name: Name of the model
|
117 |
+
provider: Provider to test
|
118 |
+
verbose: Display detailed logs
|
119 |
|
120 |
Returns:
|
121 |
+
True if provider is available, False otherwise
|
122 |
"""
|
123 |
try:
|
124 |
if verbose:
|
|
|
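The body of `_test_provider_without_token` is collapsed in this diff view. As a rough, hypothetical sketch only (not the code from this commit), an anonymous probe could look like the following; it assumes a recent `huggingface_hub` release whose `InferenceClient` accepts a `provider` argument, and the `_probe_without_token` name is invented for illustration:

```python
# Hypothetical sketch only - not the implementation from this commit.
# Assumes a recent huggingface_hub where InferenceClient accepts a `provider` argument.
from huggingface_hub import InferenceClient

def _probe_without_token(model_name: str, provider: str) -> bool:
    """Return True if the provider answers a minimal request with no HF token attached."""
    try:
        # token=False prevents the locally saved token from being sent.
        client = InferenceClient(model=model_name, provider=provider, token=False)
        client.chat_completion(
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        return True
    except Exception:
        return False
```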
175 |
# Get providers for the model and prioritize them
|
176 |
info = None
|
177 |
try:
|
178 |
+
# Try with token
|
179 |
try:
|
180 |
if verbose:
|
181 |
logger.info(f"Trying to get model info for {model_name} with auth token")
|
182 |
info = model_info(model_name, token=hf_token, expand="inferenceProviderMapping")
|
183 |
except Exception as auth_error:
|
184 |
+
# If authentication fails, try without token (for public models)
|
185 |
if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
|
186 |
if verbose:
|
187 |
logger.warning(f"Authentication failed for {model_name}, trying without token")
|
188 |
+
# Try to get info without token
|
189 |
try:
|
190 |
info = model_info(model_name, expand="inferenceProviderMapping")
|
191 |
except Exception as e:
|
192 |
if verbose:
|
193 |
logger.error(f"Failed to get model info without token: {str(e)}")
|
194 |
+
# As a last resort, return the default provider list to test
|
195 |
if verbose:
|
196 |
logger.warning(f"Using default providers list as fallback for {model_name}")
|
197 |
+
# Try with default provider list
|
198 |
return _test_fallback_providers(model_name, verbose)
|
199 |
else:
|
200 |
+
# Other error, re-raise
|
201 |
raise auth_error
|
202 |
|
203 |
if not info or not hasattr(info, "inference_provider_mapping"):
|
204 |
if verbose:
|
205 |
logger.info(f"No inference providers found for {model_name}")
|
206 |
+
# Try with default provider list
|
207 |
return _test_fallback_providers(model_name, verbose)
|
208 |
|
209 |
providers = list(info.inference_provider_mapping.keys())
|
210 |
if not providers:
|
211 |
if verbose:
|
212 |
logger.info(f"Empty list of providers for {model_name}")
|
213 |
+
# Try with default provider list
|
214 |
return _test_fallback_providers(model_name, verbose)
|
215 |
|
216 |
except Exception as e:
|
217 |
if verbose:
|
218 |
logger.error(f"Error retrieving model info for {model_name}: {str(e)}")
|
219 |
+
# Try with default provider list
|
220 |
return _test_fallback_providers(model_name, verbose)
|
221 |
|
222 |
# Prioritize providers
|
|
|
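Distilled from the hunk above, the token-then-anonymous retrieval of a model's provider mapping reduces to the sketch below. It is a simplified illustration rather than the file's actual function: `hf_token` is read from the environment as elsewhere in this file, and most error handling is trimmed.

```python
import os
from huggingface_hub import model_info

def get_provider_candidates(model_name: str) -> list:
    """Return provider names mapped to a model, retrying anonymously on auth errors."""
    hf_token = os.environ.get("HF_TOKEN")
    try:
        info = model_info(model_name, token=hf_token, expand="inferenceProviderMapping")
    except Exception as auth_error:
        if "401" in str(auth_error) or "Unauthorized" in str(auth_error):
            # Public models can still be inspected without a token.
            info = model_info(model_name, expand="inferenceProviderMapping")
        else:
            raise
    mapping = getattr(info, "inference_provider_mapping", None) or {}
    return list(mapping.keys())
```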
277 |
|
278 |
def _test_fallback_providers(model_name, verbose=False):
|
279 |
"""
|
280 |
+
Fallback function that tests a list of common providers without going through the API
|
281 |
|
282 |
Args:
|
283 |
+
model_name: Name of the model
|
284 |
+
verbose: Display detailed logs
|
285 |
|
286 |
Returns:
|
287 |
+
The first available provider or None
|
288 |
"""
|
289 |
+
# List of providers to test directly
|
290 |
default_providers = ["huggingface", "sambanova", "novita", "fireworks-ai", "together", "openai", "anthropic"]
|
291 |
|
292 |
if verbose:
|
293 |
logger.warning(f"Using fallback providers list for {model_name}: {', '.join(default_providers)}")
|
294 |
|
295 |
+
# Test each provider directly
|
296 |
for provider in default_providers:
|
297 |
if verbose:
|
298 |
logger.info(f"Testing fallback provider {provider} for {model_name}")
|
|
|
309 |
|
310 |
def test_models(verbose=True):
|
311 |
"""
|
312 |
+
Test the default model and alternative models, then return a summary of results.
|
313 |
|
314 |
Args:
|
315 |
+
verbose: Display detailed logs
|
316 |
|
317 |
Returns:
|
318 |
+
A dictionary with test results
|
319 |
"""
|
320 |
results = {
|
321 |
"default_model": None,
|
|
|
327 |
}
|
328 |
|
329 |
print("\n===== Checking HuggingFace Authentication =====")
|
330 |
+
# Get HF token
|
331 |
hf_token = os.environ.get("HF_TOKEN")
|
332 |
if hf_token:
|
333 |
print("✅ HF_TOKEN is available")
|
334 |
|
335 |
+
# Check if token has a valid format (simple check)
|
336 |
if not hf_token.startswith("hf_"):
|
337 |
print("⚠️ WARNING: Your HF_TOKEN does not start with 'hf_' which is unusual. Please verify its format.")
|
338 |
|
339 |
+
# Don't show any token characters, just indicate its presence
|
340 |
masked_token = "••••••••••"
|
341 |
|
342 |
+
# Check token validity by testing inference API directly
|
343 |
import requests
|
344 |
try:
|
345 |
+
# Test with a simple public model (gpt2)
|
346 |
test_model = "gpt2"
|
347 |
api_url = f"https://api-inference.huggingface.co/models/{test_model}"
|
348 |
|
|
|
353 |
|
354 |
response = requests.post(api_url, headers=headers, json=payload, timeout=10)
|
355 |
|
356 |
+
if response.status_code in [200, 503]: # 503 = model is loading, but token is accepted
|
357 |
print(f"✅ HF_TOKEN validated - Token accepted by the inference API! Status: {response.status_code}")
|
358 |
if response.status_code == 503:
|
359 |
print("ℹ️ Model is loading, but token is valid")
|
360 |
|
361 |
+
# If token is valid for inference API, also check if we can get
|
362 |
+
# user information (but not blocking if it fails)
|
363 |
try:
|
364 |
whoami_response = requests.get(
|
365 |
"https://huggingface.co/api/whoami",
|
|
|
370 |
user_info = whoami_response.json()
|
371 |
print(f"✅ Additional info - Authenticated as: {user_info.get('name', 'Unknown user')}")
|
372 |
|
373 |
+
# Check if user has access to paid models
|
374 |
if user_info.get('canPay', False):
|
375 |
print("✅ Your account has payment methods configured - you may have access to premium models")
|
376 |
else:
|
377 |
print("ℹ️ Your account does not have payment methods configured - access to premium models may be limited")
|
378 |
except Exception:
|
379 |
+
# Ignore errors when getting user info
|
380 |
pass
|
381 |
else:
|
382 |
print(f"❌ HF_TOKEN validation failed with status code: {response.status_code}")
|
|
|
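The optional account check above relies on the public `whoami` endpoint. A compact standalone sketch of that call is shown below; the endpoint and the `name`/`canPay` fields mirror the code above, while the helper name is ours and the Bearer header is assumed to follow the same pattern as the inference-API request.

```python
import os
import requests

def describe_hf_account(timeout: int = 10) -> None:
    """Print the account name and whether payment methods are configured, if the token allows it."""
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("No HF_TOKEN available")
        return
    response = requests.get(
        "https://huggingface.co/api/whoami",
        headers={"Authorization": f"Bearer {token}"},
        timeout=timeout,
    )
    if response.ok:
        user_info = response.json()
        print(f"Authenticated as: {user_info.get('name', 'Unknown user')}")
        print(f"Payment methods configured: {user_info.get('canPay', False)}")
    else:
        print(f"whoami request failed with status {response.status_code}")
```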
391 |
|
392 |
print("⚠️ Most model providers will not work with invalid credentials")
|
393 |
|
394 |
+
# Alternative test with status endpoint
|
395 |
try:
|
396 |
print("Attempting alternative validation with status endpoint...")
|
397 |
status_url = "https://api-inference.huggingface.co/status"
|
|
|
409 |
print("❌ HF_TOKEN is missing - authentication to HuggingFace API will fail")
|
410 |
print("⚠️ Most models and providers require authentication")
|
411 |
|
412 |
+
# Get HF organization
|
413 |
hf_organization = os.environ.get("HF_ORGANIZATION")
|
414 |
if hf_organization:
|
415 |
print(f"✅ HF_ORGANIZATION is available: {hf_organization}")
|
|
|
419 |
if verbose:
|
420 |
print(f"\n===== Testing main default model: {DEFAULT_BENCHMARK_MODEL} =====")
|
421 |
|
422 |
+
# Test the default model
|
423 |
provider = get_available_model_provider(DEFAULT_BENCHMARK_MODEL, verbose=verbose)
|
424 |
|
425 |
if provider:
|
|
|
433 |
print(f"\n❌ DEFAULT MODEL FAILED: No provider found for {DEFAULT_BENCHMARK_MODEL}")
|
434 |
print("Trying alternative models...")
|
435 |
|
436 |
+
# Try alternative models
|
437 |
for alt_model in ALTERNATIVE_BENCHMARK_MODELS:
|
438 |
if verbose:
|
439 |
print(f"\nTrying alternative model: {alt_model}")
|
|
|
452 |
print("\n⚠️ This is likely due to authentication issues with your HF_TOKEN")
|
453 |
print("⚠️ Please check your token or try using models that don't require authentication")
|
454 |
|
455 |
+
# Test all models to get an overview
|
456 |
models = [
|
457 |
"Qwen/QwQ-32B",
|
458 |
"Qwen/Qwen2.5-72B-Instruct",
|
|
|
495 |
return results
|
496 |
|
497 |
if __name__ == "__main__":
|
498 |
+
# Run test if script is run directly
|
499 |
test_results = test_models(verbose=True)
|
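Putting the pieces of `test_models` together, the default-then-alternatives selection reduces to the following outline. It reuses `get_available_model_provider` from this file, and `DEFAULT_BENCHMARK_MODEL` / `ALTERNATIVE_BENCHMARK_MODELS` are the config constants imported by the real script, so this is a sketch rather than a drop-in replacement.

```python
def pick_working_model(default_model, alternative_models, verbose=True):
    """Return (model, provider) for the first model with a reachable provider, else (None, None)."""
    for model in [default_model, *alternative_models]:
        provider = get_available_model_provider(model, verbose=verbose)
        if provider:
            return model, provider
    return None, None

# Example usage (names as used in the file above):
# model, provider = pick_working_model(DEFAULT_BENCHMARK_MODEL, ALTERNATIVE_BENCHMARK_MODELS)
```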
backend/tests/check_hf_token.py
CHANGED
@@ -2,8 +2,8 @@
|
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
"""
|
5 |
-
|
6 |
-
|
7 |
"""
|
8 |
|
9 |
import os
|
@@ -42,16 +42,16 @@ def info(text):
|
|
42 |
|
43 |
def check_token_via_inference_api(token=None, verbose=True):
|
44 |
"""
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
|
49 |
Args:
|
50 |
-
token:
|
51 |
-
verbose:
|
52 |
|
53 |
Returns:
|
54 |
-
dict:
|
55 |
"""
|
56 |
results = {
|
57 |
"is_valid": False,
|
@@ -69,21 +69,21 @@ def check_token_via_inference_api(token=None, verbose=True):
|
|
69 |
results["error_message"] = "No token provided"
|
70 |
return results
|
71 |
|
72 |
-
#
|
73 |
masked_token = "••••••••••"
|
74 |
results["token"] = masked_token
|
75 |
|
76 |
print(info(f"Token à vérifier: {masked_token}"))
|
77 |
|
78 |
-
# 2.
|
79 |
if not token.startswith("hf_"):
|
80 |
print(warning("Le token ne commence pas par 'hf_' ce qui est inhabituel. Vérifiez son format."))
|
81 |
else:
|
82 |
print(success("Format du token valide (commence par 'hf_')"))
|
83 |
|
84 |
-
# 3.
|
85 |
try:
|
86 |
-
# Test
|
87 |
test_model = "gpt2"
|
88 |
api_url = f"https://api-inference.huggingface.co/models/{test_model}"
|
89 |
|
@@ -94,7 +94,7 @@ def check_token_via_inference_api(token=None, verbose=True):
|
|
94 |
|
95 |
response = requests.post(api_url, headers=headers, json=payload, timeout=10)
|
96 |
|
97 |
-
if response.status_code in [200, 503]: # 503
|
98 |
print(success(f"Token valide pour l'API d'inférence! Status code: {response.status_code}"))
|
99 |
if response.status_code == 503:
|
100 |
print(info("Le modèle est en cours de chargement. Le token a bien été accepté par l'API."))
|
@@ -116,7 +116,7 @@ def check_token_via_inference_api(token=None, verbose=True):
|
|
116 |
except:
|
117 |
print(error(f"Message d'erreur: {response.text}"))
|
118 |
|
119 |
-
#
|
120 |
try:
|
121 |
print(info("Test alternatif avec la liste des modèles déployés..."))
|
122 |
list_url = "https://api-inference.huggingface.co/status"
|
@@ -135,17 +135,17 @@ def check_token_via_inference_api(token=None, verbose=True):
|
|
135 |
print(error(f"Erreur lors du test de l'API d'inférence: {str(e)}"))
|
136 |
results["error_message"] = str(e)
|
137 |
|
138 |
-
# 4.
|
139 |
if results["is_valid"]:
|
140 |
try:
|
141 |
print(info("\nTest des permissions du token..."))
|
142 |
|
143 |
-
#
|
144 |
if os.environ.get("HF_ORGANIZATION"):
|
145 |
org = os.environ.get("HF_ORGANIZATION")
|
146 |
print(info(f"Test d'accès aux modèles de l'organisation {org}..."))
|
147 |
|
148 |
-
#
|
149 |
org_url = f"https://huggingface.co/api/models?author={org}"
|
150 |
org_response = requests.get(org_url, headers=headers, timeout=10)
|
151 |
|
@@ -161,15 +161,15 @@ def check_token_via_inference_api(token=None, verbose=True):
|
|
161 |
|
162 |
def check_model_access(token, model, verbose=False):
|
163 |
"""
|
164 |
-
|
165 |
|
166 |
Args:
|
167 |
-
token:
|
168 |
-
model:
|
169 |
-
verbose:
|
170 |
|
171 |
Returns:
|
172 |
-
bool: True
|
173 |
"""
|
174 |
print(f"\n" + info(f"Test d'accès au modèle: {model}"))
|
175 |
|
@@ -177,7 +177,7 @@ def check_model_access(token, model, verbose=False):
|
|
177 |
"Authorization": f"Bearer {token}"
|
178 |
}
|
179 |
|
180 |
-
# 1.
|
181 |
try:
|
182 |
api_url = f"https://api-inference.huggingface.co/models/{model}"
|
183 |
payload = {"inputs": "Hello, test access"}
|
@@ -186,7 +186,7 @@ def check_model_access(token, model, verbose=False):
|
|
186 |
|
187 |
response = requests.post(api_url, headers=headers, json=payload, timeout=20)
|
188 |
|
189 |
-
if response.status_code in [200, 503]: # 503 =
|
190 |
if response.status_code == 200:
|
191 |
print(success(f"Accès réussi à l'API d'inférence pour {model}"))
|
192 |
return True
|
@@ -210,7 +210,7 @@ def check_model_access(token, model, verbose=False):
|
|
210 |
print(warning("Possible problème de quota ou de limite de taux"))
|
211 |
elif "loading" in error_message.lower():
|
212 |
print(info("Le modèle est en cours de chargement - réessayez plus tard"))
|
213 |
-
return True #
|
214 |
elif "permission" in error_message.lower() or "access" in error_message.lower():
|
215 |
print(error("Problème de permissions - vous n'avez pas accès à ce modèle"))
|
216 |
|
@@ -250,16 +250,16 @@ def main():
|
|
250 |
|
251 |
args = parser.parse_args()
|
252 |
|
253 |
-
#
|
254 |
load_dotenv()
|
255 |
|
256 |
print(info(f"=== Vérification de Token Hugging Face - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n"))
|
257 |
|
258 |
-
#
|
259 |
token = args.token or os.environ.get("HF_TOKEN")
|
260 |
token_info = check_token_via_inference_api(token, args.verbose)
|
261 |
|
262 |
-
#
|
263 |
if token_info["is_valid"]:
|
264 |
if args.test_model:
|
265 |
check_model_access(token, args.test_model, args.verbose)
|
|
|
2 |
# -*- coding: utf-8 -*-
|
3 |
|
4 |
"""
|
5 |
+
Standalone script to check and display Hugging Face token properties.
|
6 |
+
This script can be run separately to diagnose authentication issues.
|
7 |
"""
|
8 |
|
9 |
import os
|
|
|
42 |
|
43 |
def check_token_via_inference_api(token=None, verbose=True):
|
44 |
"""
|
45 |
+
Check the validity of an HF token by directly testing the inference API.
|
46 |
+
The whoami API does not always work reliably for every token, but the inference API
|
47 |
+
is what this application relies on.
|
48 |
|
49 |
Args:
|
50 |
+
token: The token to check
|
51 |
+
verbose: Display detailed information
|
52 |
|
53 |
Returns:
|
54 |
+
dict: Check results
|
55 |
"""
|
56 |
results = {
|
57 |
"is_valid": False,
|
|
|
69 |
results["error_message"] = "No token provided"
|
70 |
return results
|
71 |
|
72 |
+
# Don't show any token characters, just indicate its presence
|
73 |
masked_token = "••••••••••"
|
74 |
results["token"] = masked_token
|
75 |
|
76 |
print(info(f"Token à vérifier: {masked_token}"))
|
77 |
|
78 |
+
# 2. Check basic format
|
79 |
if not token.startswith("hf_"):
|
80 |
print(warning("Le token ne commence pas par 'hf_' ce qui est inhabituel. Vérifiez son format."))
|
81 |
else:
|
82 |
print(success("Format du token valide (commence par 'hf_')"))
|
83 |
|
84 |
+
# 3. Test inference API directly - recommended method to validate a token
|
85 |
try:
|
86 |
+
# Test with a simple public model
|
87 |
test_model = "gpt2"
|
88 |
api_url = f"https://api-inference.huggingface.co/models/{test_model}"
|
89 |
|
|
|
94 |
|
95 |
response = requests.post(api_url, headers=headers, json=payload, timeout=10)
|
96 |
|
97 |
+
if response.status_code in [200, 503]: # 503 means the model is loading, but the token is valid
|
98 |
print(success(f"Token valide pour l'API d'inférence! Status code: {response.status_code}"))
|
99 |
if response.status_code == 503:
|
100 |
print(info("Le modèle est en cours de chargement. Le token a bien été accepté par l'API."))
|
|
|
116 |
except:
|
117 |
print(error(f"Message d'erreur: {response.text}"))
|
118 |
|
119 |
+
# In case of failure, also test the model list endpoint
|
120 |
try:
|
121 |
print(info("Test alternatif avec la liste des modèles déployés..."))
|
122 |
list_url = "https://api-inference.huggingface.co/status"
|
|
|
135 |
print(error(f"Erreur lors du test de l'API d'inférence: {str(e)}"))
|
136 |
results["error_message"] = str(e)
|
137 |
|
138 |
+
# 4. Additional permission tests
|
139 |
if results["is_valid"]:
|
140 |
try:
|
141 |
print(info("\nTest des permissions du token..."))
|
142 |
|
143 |
+
# Test if we can access organization's private models
|
144 |
if os.environ.get("HF_ORGANIZATION"):
|
145 |
org = os.environ.get("HF_ORGANIZATION")
|
146 |
print(info(f"Test d'accès aux modèles de l'organisation {org}..."))
|
147 |
|
148 |
+
# Just check if we can access the organization's model list
|
149 |
org_url = f"https://huggingface.co/api/models?author={org}"
|
150 |
org_response = requests.get(org_url, headers=headers, timeout=10)
|
151 |
|
|
|
161 |
|
162 |
def check_model_access(token, model, verbose=False):
|
163 |
"""
|
164 |
+
Check if the token has access to a specific model.
|
165 |
|
166 |
Args:
|
167 |
+
token: HF token to check
|
168 |
+
model: Name of the model to test
|
169 |
+
verbose: Display detailed information
|
170 |
|
171 |
Returns:
|
172 |
+
bool: True if model is accessible, False otherwise
|
173 |
"""
|
174 |
print(f"\n" + info(f"Test d'accès au modèle: {model}"))
|
175 |
|
|
|
177 |
"Authorization": f"Bearer {token}"
|
178 |
}
|
179 |
|
180 |
+
# 1. Check if the model exists and is accessible via inference API
|
181 |
try:
|
182 |
api_url = f"https://api-inference.huggingface.co/models/{model}"
|
183 |
payload = {"inputs": "Hello, test access"}
|
|
|
186 |
|
187 |
response = requests.post(api_url, headers=headers, json=payload, timeout=20)
|
188 |
|
189 |
+
if response.status_code in [200, 503]: # 503 = model is loading, but token is valid
|
190 |
if response.status_code == 200:
|
191 |
print(success(f"Accès réussi à l'API d'inférence pour {model}"))
|
192 |
return True
|
|
|
210 |
print(warning("Possible problème de quota ou de limite de taux"))
|
211 |
elif "loading" in error_message.lower():
|
212 |
print(info("Le modèle est en cours de chargement - réessayez plus tard"))
|
213 |
+
return True # Treat as success because the token was accepted
|
214 |
elif "permission" in error_message.lower() or "access" in error_message.lower():
|
215 |
print(error("Problème de permissions - vous n'avez pas accès à ce modèle"))
|
216 |
|
|
|
250 |
|
251 |
args = parser.parse_args()
|
252 |
|
253 |
+
# Load environment variables
|
254 |
load_dotenv()
|
255 |
|
256 |
print(info(f"=== Vérification de Token Hugging Face - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n"))
|
257 |
|
258 |
+
# Check token directly via inference API
|
259 |
token = args.token or os.environ.get("HF_TOKEN")
|
260 |
token_info = check_token_via_inference_api(token, args.verbose)
|
261 |
|
262 |
+
# If token is valid and we were asked to test a model
|
263 |
if token_info["is_valid"]:
|
264 |
if args.test_model:
|
265 |
check_model_access(token, args.test_model, args.verbose)
|
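Both backend files above validate a token the same way: POST a tiny payload to the classic inference API and treat 200 or 503 as proof the token was accepted. A minimal standalone version of that check, mirroring the endpoint, payload, and status handling shown above:

```python
import os
import requests

def hf_token_is_accepted(token=None, timeout=10):
    """Return True if the inference API accepts the token (200 OK, or 503 while the model loads)."""
    token = token or os.environ.get("HF_TOKEN")
    if not token:
        return False
    response = requests.post(
        "https://api-inference.huggingface.co/models/gpt2",
        headers={"Authorization": f"Bearer {token}"},
        json={"inputs": "Hello"},
        timeout=timeout,
    )
    return response.status_code in (200, 503)

if __name__ == "__main__":
    print("Token accepted" if hf_token_is_accepted() else "Token rejected or missing")
```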
frontend/src/components/Evaluation/Display.jsx
CHANGED
@@ -20,7 +20,7 @@ import OpenInNewIcon from "@mui/icons-material/OpenInNew";
|
|
20 |
import CheckCircleIcon from "@mui/icons-material/CheckCircle";
|
21 |
import ErrorDisplay from "../common/ErrorDisplay";
|
22 |
|
23 |
-
// Styles
|
24 |
const MEDAL_STYLES = {
|
25 |
1: {
|
26 |
color: "#B58A1B",
|
@@ -48,7 +48,7 @@ const MEDAL_STYLES = {
|
|
48 |
},
|
49 |
};
|
50 |
|
51 |
-
//
|
52 |
const getMedalStyle = (rank) => {
|
53 |
if (rank <= 3) {
|
54 |
const medalStyle = MEDAL_STYLES[rank];
|
@@ -72,7 +72,7 @@ const getMedalStyle = (rank) => {
|
|
72 |
marginRight: "8px",
|
73 |
};
|
74 |
}
|
75 |
-
//
|
76 |
return {
|
77 |
color: "text.primary",
|
78 |
fontWeight: rank <= 10 ? 600 : 400,
|
|
|
20 |
import CheckCircleIcon from "@mui/icons-material/CheckCircle";
|
21 |
import ErrorDisplay from "../common/ErrorDisplay";
|
22 |
|
23 |
+
// Styles for medals
|
24 |
const MEDAL_STYLES = {
|
25 |
1: {
|
26 |
color: "#B58A1B",
|
|
|
48 |
},
|
49 |
};
|
50 |
|
51 |
+
// Function to get medal style based on rank
|
52 |
const getMedalStyle = (rank) => {
|
53 |
if (rank <= 3) {
|
54 |
const medalStyle = MEDAL_STYLES[rank];
|
|
|
72 |
marginRight: "8px",
|
73 |
};
|
74 |
}
|
75 |
+
// For ranks > 3, same dimensions but transparent
|
76 |
return {
|
77 |
color: "text.primary",
|
78 |
fontWeight: rank <= 10 ? 600 : 400,
|
frontend/src/components/Evaluation/hooks/useSimulation.js
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
import { useState, useRef, useEffect } from "react";
|
2 |
|
3 |
// Simulation time in milliseconds for pre-calculated documents
|
4 |
-
const SIMULATION_DURATION =
|
5 |
-
const STEP_DURATION = SIMULATION_DURATION / 5; //
|
6 |
|
7 |
// Starting messages with their timing
|
8 |
const STARTING_MESSAGES = [
|
@@ -19,15 +19,15 @@ export const useSimulation = (onComplete, shouldStart = false) => {
|
|
19 |
const timeoutsRef = useRef([]);
|
20 |
const hasInitializedRef = useRef(false);
|
21 |
|
22 |
-
//
|
23 |
useEffect(() => {
|
24 |
if (!shouldStart || hasInitializedRef.current) return;
|
25 |
|
26 |
-
//
|
27 |
hasInitializedRef.current = true;
|
28 |
console.log("Simulation starting with shouldStart =", shouldStart);
|
29 |
|
30 |
-
//
|
31 |
for (let i = 1; i < STARTING_MESSAGES.length; i++) {
|
32 |
const timeout = setTimeout(() => {
|
33 |
console.log(`Setting message index to ${i}`);
|
@@ -49,7 +49,7 @@ export const useSimulation = (onComplete, shouldStart = false) => {
|
|
49 |
timeoutsRef.current.push(completeTimeout);
|
50 |
|
51 |
return () => {
|
52 |
-
//
|
53 |
timeoutsRef.current.forEach(clearTimeout);
|
54 |
};
|
55 |
}, [shouldStart, onComplete]);
|
|
|
1 |
import { useState, useRef, useEffect } from "react";
|
2 |
|
3 |
// Simulation time in milliseconds for pre-calculated documents
|
4 |
+
const SIMULATION_DURATION = 7000; // Duration in milliseconds
|
5 |
+
const STEP_DURATION = SIMULATION_DURATION / 5; // Duration of each step
|
6 |
|
7 |
// Starting messages with their timing
|
8 |
const STARTING_MESSAGES = [
|
|
|
19 |
const timeoutsRef = useRef([]);
|
20 |
const hasInitializedRef = useRef(false);
|
21 |
|
22 |
+
// Effect to start simulation if shouldStart is true
|
23 |
useEffect(() => {
|
24 |
if (!shouldStart || hasInitializedRef.current) return;
|
25 |
|
26 |
+
// Mark as initialized
|
27 |
hasInitializedRef.current = true;
|
28 |
console.log("Simulation starting with shouldStart =", shouldStart);
|
29 |
|
30 |
+
// Schedule sequential timeouts for each step
|
31 |
for (let i = 1; i < STARTING_MESSAGES.length; i++) {
|
32 |
const timeout = setTimeout(() => {
|
33 |
console.log(`Setting message index to ${i}`);
|
|
|
49 |
timeoutsRef.current.push(completeTimeout);
|
50 |
|
51 |
return () => {
|
52 |
+
// Clean up all timeouts on unmount
|
53 |
timeoutsRef.current.forEach(clearTimeout);
|
54 |
};
|
55 |
}, [shouldStart, onComplete]);
|
frontend/src/hooks/useDevShortcuts.js
CHANGED
@@ -2,10 +2,10 @@ import { useEffect } from "react";
|
|
2 |
import { useNavigate } from "react-router-dom";
|
3 |
|
4 |
/**
|
5 |
-
* Hook
|
6 |
*
|
7 |
-
* @param {Object} options - Options
|
8 |
-
* @param {string} options.sessionId -
|
9 |
* @returns {void}
|
10 |
*/
|
11 |
const useDevShortcuts = ({ sessionId = null } = {}) => {
|
@@ -13,18 +13,18 @@ const useDevShortcuts = ({ sessionId = null } = {}) => {
|
|
13 |
|
14 |
useEffect(() => {
|
15 |
const handleKeyDown = (e) => {
|
16 |
-
//
|
17 |
if (e.key === "p") {
|
18 |
console.log("Debug key pressed: Clearing auth data and refreshing");
|
19 |
localStorage.removeItem("hf_oauth");
|
20 |
localStorage.removeItem("auth_return_to");
|
21 |
-
//
|
22 |
alert("Auth data cleared. Page will reload.");
|
23 |
-
//
|
24 |
window.location.reload();
|
25 |
}
|
26 |
|
27 |
-
//
|
28 |
if (e.key === "d" && sessionId) {
|
29 |
console.log("Debug key pressed: Showing BenchmarkDisplay");
|
30 |
navigate(`/benchmark-display?session=${sessionId}`);
|
|
|
2 |
import { useNavigate } from "react-router-dom";
|
3 |
|
4 |
/**
|
5 |
+
* Hook for developer keyboard shortcuts
|
6 |
*
|
7 |
+
* @param {Object} options - Options for the hook
|
8 |
+
* @param {string} options.sessionId - Current session ID (if available)
|
9 |
* @returns {void}
|
10 |
*/
|
11 |
const useDevShortcuts = ({ sessionId = null } = {}) => {
|
|
|
13 |
|
14 |
useEffect(() => {
|
15 |
const handleKeyDown = (e) => {
|
16 |
+
// Shortcut 'p' - clear authentication data and reload
|
17 |
if (e.key === "p") {
|
18 |
console.log("Debug key pressed: Clearing auth data and refreshing");
|
19 |
localStorage.removeItem("hf_oauth");
|
20 |
localStorage.removeItem("auth_return_to");
|
21 |
+
// Show a brief message
|
22 |
alert("Auth data cleared. Page will reload.");
|
23 |
+
// Reload the page
|
24 |
window.location.reload();
|
25 |
}
|
26 |
|
27 |
+
// Shortcut 'd' - go directly to benchmark display
|
28 |
if (e.key === "d" && sessionId) {
|
29 |
console.log("Debug key pressed: Showing BenchmarkDisplay");
|
30 |
navigate(`/benchmark-display?session=${sessionId}`);
|
frontend/src/pages/BenchmarkGenerationPage.jsx
CHANGED
@@ -21,8 +21,8 @@ function BenchmarkGenerationPage() {
|
|
21 |
const handleGenerationComplete = (result) => {
|
22 |
console.log("Benchmark generation completed:", result);
|
23 |
if (result && result.success && !hasRedirectedRef.current) {
|
24 |
-
hasRedirectedRef.current = true; //
|
25 |
-
//
|
26 |
setTimeout(() => {
|
27 |
navigate(`/benchmark-display?session=${sessionId}`);
|
28 |
}, 500);
|
|
|
21 |
const handleGenerationComplete = (result) => {
|
22 |
console.log("Benchmark generation completed:", result);
|
23 |
if (result && result.success && !hasRedirectedRef.current) {
|
24 |
+
hasRedirectedRef.current = true; // Mark that redirection has been done
|
25 |
+
// Short pause before navigating to avoid synchronization issues
|
26 |
setTimeout(() => {
|
27 |
navigate(`/benchmark-display?session=${sessionId}`);
|
28 |
}, 500);
|
frontend/src/pages/EvaluationDisplayPage.jsx
CHANGED
@@ -18,14 +18,14 @@ function EvaluationDisplayPage() {
|
|
18 |
const { mode } = useThemeMode();
|
19 |
const theme = getTheme(mode);
|
20 |
|
21 |
-
//
|
22 |
const baseDocuments = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"];
|
23 |
const isBaseDocument = baseDocuments.includes(sessionId);
|
24 |
|
25 |
useEffect(() => {
|
26 |
if (!sessionId) {
|
27 |
console.log(
|
28 |
-
"Session ID
|
29 |
);
|
30 |
setIsValidSession(false);
|
31 |
return;
|
@@ -33,20 +33,20 @@ function EvaluationDisplayPage() {
|
|
33 |
|
34 |
const fetchEvaluationResults = async () => {
|
35 |
try {
|
36 |
-
//
|
37 |
const sessionCheckResponse = await fetch(
|
38 |
`${API_CONFIG.BASE_URL}/benchmark-questions/${sessionId}`
|
39 |
);
|
40 |
|
41 |
if (!sessionCheckResponse.ok) {
|
42 |
console.error(
|
43 |
-
`
|
44 |
);
|
45 |
setIsValidSession(false);
|
46 |
return;
|
47 |
}
|
48 |
|
49 |
-
//
|
50 |
const evalResponse = await fetch(
|
51 |
`${API_CONFIG.BASE_URL}/evaluation-results/${sessionId}`
|
52 |
);
|
@@ -77,14 +77,14 @@ function EvaluationDisplayPage() {
|
|
77 |
fetchEvaluationResults();
|
78 |
}, [sessionId]);
|
79 |
|
80 |
-
//
|
81 |
useEffect(() => {
|
82 |
-
//
|
83 |
if (isBaseDocument || isLoading || !evaluationResults) {
|
84 |
return;
|
85 |
}
|
86 |
|
87 |
-
//
|
88 |
const cleanupSession = async () => {
|
89 |
try {
|
90 |
const response = await fetch(
|
@@ -104,12 +104,12 @@ function EvaluationDisplayPage() {
|
|
104 |
}
|
105 |
};
|
106 |
|
107 |
-
//
|
108 |
const cleanupTimeout = setTimeout(() => {
|
109 |
cleanupSession();
|
110 |
}, 2000);
|
111 |
|
112 |
-
//
|
113 |
return () => clearTimeout(cleanupTimeout);
|
114 |
}, [sessionId, isBaseDocument, isLoading, evaluationResults]);
|
115 |
|
|
|
18 |
const { mode } = useThemeMode();
|
19 |
const theme = getTheme(mode);
|
20 |
|
21 |
+
// List of base documents that should not be deleted
|
22 |
const baseDocuments = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"];
|
23 |
const isBaseDocument = baseDocuments.includes(sessionId);
|
24 |
|
25 |
useEffect(() => {
|
26 |
if (!sessionId) {
|
27 |
console.log(
|
28 |
+
"Session ID missing for displaying results, redirecting to home"
|
29 |
);
|
30 |
setIsValidSession(false);
|
31 |
return;
|
|
|
33 |
|
34 |
const fetchEvaluationResults = async () => {
|
35 |
try {
|
36 |
+
// First check if the session exists
|
37 |
const sessionCheckResponse = await fetch(
|
38 |
`${API_CONFIG.BASE_URL}/benchmark-questions/${sessionId}`
|
39 |
);
|
40 |
|
41 |
if (!sessionCheckResponse.ok) {
|
42 |
console.error(
|
43 |
+
`Invalid session or server error: ${sessionCheckResponse.status}`
|
44 |
);
|
45 |
setIsValidSession(false);
|
46 |
return;
|
47 |
}
|
48 |
|
49 |
+
// Retrieve evaluation results
|
50 |
const evalResponse = await fetch(
|
51 |
`${API_CONFIG.BASE_URL}/evaluation-results/${sessionId}`
|
52 |
);
|
|
|
77 |
fetchEvaluationResults();
|
78 |
}, [sessionId]);
|
79 |
|
80 |
+
// Effect to clean up the session folder after displaying results
|
81 |
useEffect(() => {
|
82 |
+
// Do not clean up if it's a base document or if results are not yet loaded
|
83 |
if (isBaseDocument || isLoading || !evaluationResults) {
|
84 |
return;
|
85 |
}
|
86 |
|
87 |
+
// Function to clean up the session folder
|
88 |
const cleanupSession = async () => {
|
89 |
try {
|
90 |
const response = await fetch(
|
|
|
104 |
}
|
105 |
};
|
106 |
|
107 |
+
// Call the function after a delay to ensure the user has had time to see the results
|
108 |
const cleanupTimeout = setTimeout(() => {
|
109 |
cleanupSession();
|
110 |
}, 2000);
|
111 |
|
112 |
+
// Clean up the timeout if component is unmounted
|
113 |
return () => clearTimeout(cleanupTimeout);
|
114 |
}, [sessionId, isBaseDocument, isLoading, evaluationResults]);
|
115 |
|