"""Collect and summarize Hugging Face Transformers audio-model CI results.

Reads the audio-model list from the transformers documentation toctree, walks
the GitHub Actions jobs of a given workflow run, parses each job's pytest
summary out of its logs, and writes the aggregated per-model counts to
``test_results_by_type.csv``.
"""

import asyncio
import os
import re

import aiohttp
import pandas as pd
import requests
import yaml
from tqdm import tqdm


def get_audio_models():
    """Return the audio-model doc slugs from the transformers doc toctree.

    Slugs are lowercased with ``-`` replaced by ``_`` (e.g. ``wav2vec2_bert``).
    Returns ``[]`` when the YAML cannot be fetched or the expected
    API -> Models -> Audio models section layout is missing.
    """
    url = "https://raw.githubusercontent.com/huggingface/transformers/main/docs/source/en/_toctree.yml"
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to fetch the YAML file")
        return []
    toctree_content = yaml.safe_load(response.text)
    # Walk the nested toctree: API -> Models -> Audio models.
    for section in toctree_content:
        if section.get('title') == 'API':
            for subsection in section.get('sections', []):
                if subsection.get('title') == 'Models':
                    for model_section in subsection.get('sections', []):
                        if model_section.get('title') == 'Audio models':
                            # 'local' looks like "model_doc/<slug>"; keep the
                            # last path component, normalized to snake_case.
                            return [
                                audio_model['local'].split('/')[-1].lower().replace('-', '_')
                                for audio_model in model_section.get('sections', [])
                                if 'local' in audio_model
                            ]
    return []


def parse_test_results(text):
    """Extract the pytest terminal summary counts from a log *text*.

    Returns:
        dict with ``failed``, ``passed`` and ``skipped`` ints (0 for any
        group absent from the summary line).

    Raises:
        ValueError: if no pytest summary line is found in *text*.
    """
    pattern = r'=+ (?:(\d+) failed,?\s*)?(?:(\d+) passed,?\s*)?(?:(\d+) skipped,?\s*)?(?:\d+ warnings?\s*)?in \d+\.\d+s'
    match = re.search(pattern, text)
    if match:
        failed = int(match.group(1)) if match.group(1) else 0
        passed = int(match.group(2)) if match.group(2) else 0
        skipped = int(match.group(3)) if match.group(3) else 0
        return {'failed': failed, 'passed': passed, 'skipped': skipped}
    # ValueError (a subclass of the original bare Exception) keeps any
    # `except Exception` callers working while being more specific.
    raise ValueError("Could not find test summary in logs")


def fetch_and_process_ci_results(job_id):
    """Aggregate audio-model test results for GitHub Actions run *job_id*.

    Fetches every page of the run's jobs, matches jobs against the audio
    models from :func:`get_audio_models`, parses each matched job's pytest
    summary from its logs, and writes the counts grouped by
    (model, conclusion, test_type) to ``test_results_by_type.csv``.

    Raises:
        ValueError: if ``GITHUB_TOKEN`` is not set in the environment, or a
            job log contains no pytest summary.
        requests.HTTPError / aiohttp.ClientResponseError: on failed GitHub
            API requests.
    """
    github_token = os.environ.get('GITHUB_TOKEN')
    if not github_token:
        raise ValueError("GitHub token not found in environment variables")
    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github+json",
    }

    audio_models = get_audio_models()
    # Models that appear in the docs but have no CI test jobs.
    # NOTE(review): this list is never referenced below — presumably meant to
    # filter `audio_models`; confirm intent before wiring it in.
    non_tested_models = [
        "xls_r",
        "speech_to_text_2",
        "mctct",
        "xlsr_wav2vec2",
        "mms",
    ]

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{job_id}/jobs"
    audio_model_jobs = {audio_model: [] for audio_model in audio_models}

    def process_jobs(jobs_data):
        # Collect ids of jobs named like "... Model CI ... (models/<name> ...)".
        for job in jobs_data['jobs']:
            if "Model CI" in job['name'] and "models" in job['name']:
                match = re.search(r'models/([^/)]+)', job['name'])
                if match:
                    model_name = match.group(1).lower()
                    if model_name in audio_model_jobs:
                        audio_model_jobs[model_name].append(job['id'])

    async def fetch_and_process_jobs(session, page_url):
        # Fetch one page of jobs; return the next page's URL from the RFC 5988
        # Link header, or None when there are no more pages.
        async with session.get(page_url, headers=headers) as response:
            response.raise_for_status()  # fail loudly on auth/rate-limit errors
            jobs_data = await response.json()
        process_jobs(jobs_data)
        return response.links.get('next', {}).get('url')

    async def fetch_all_jobs():
        # Follow Link-header pagination until exhausted.
        async with aiohttp.ClientSession() as session:
            next_url = url
            with tqdm(desc="Fetching jobs", unit="page") as pbar:
                while next_url:
                    next_url = await fetch_and_process_jobs(session, next_url)
                    pbar.update(1)

    def retrieve_job_logs(single_job_id, job_name):
        # One job -> its parsed pytest summary plus model name and conclusion.
        job_url = f"https://api.github.com/repos/huggingface/transformers/actions/jobs/{single_job_id}"
        response = requests.get(job_url, headers=headers)
        response.raise_for_status()
        logs_response = requests.get(f"{job_url}/logs", headers=headers)
        logs_response.raise_for_status()
        test_summary = parse_test_results(logs_response.text)
        test_summary["model"] = job_name
        test_summary["conclusion"] = response.json()['conclusion']
        return test_summary

    # BUG FIX: the original fetched and processed page 1 synchronously and
    # then re-fetched the same URL inside fetch_all_jobs(), appending every
    # page-1 job id twice and double-counting its test results. The async
    # loop alone now processes each page exactly once.
    asyncio.run(fetch_all_jobs())

    # Retrieve and parse the logs of every matched job.
    results = []
    for job_name, job_ids in tqdm(audio_model_jobs.items()):
        for jid in job_ids:
            results.append(retrieve_job_logs(jid, job_name))

    # Wide -> long reshape, then sum counts per (model, conclusion, test_type).
    df = (
        pd.DataFrame(results)
        .melt(
            id_vars=['model', 'conclusion'],
            value_vars=['failed', 'passed', 'skipped'],
            var_name='test_type',
            value_name='number_of_tests',
        )
        .groupby(['model', 'conclusion', 'test_type'])
        .agg({'number_of_tests': 'sum'})
        .reset_index()
    )
    df.to_csv('test_results_by_type.csv', index=False)