Spaces:
Sleeping
Sleeping
File size: 4,721 Bytes
5923f2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import requests
import yaml
import os
import re
import asyncio
import aiohttp
import pandas as pd
from tqdm import tqdm
def get_audio_models():
    """Return normalized names of the audio models listed in the transformers docs.

    Downloads the docs ``_toctree.yml`` and walks API -> Models -> Audio models,
    converting each entry's ``local`` path (e.g. ``model_doc/speech-to-text``)
    into a snake_case model name. Returns an empty list when the file cannot be
    fetched or the section is missing.
    """
    url = "https://raw.githubusercontent.com/huggingface/transformers/main/docs/source/en/_toctree.yml"
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to fetch the YAML file")
        return []
    toctree = yaml.safe_load(response.text)

    # Walk the nested toctree until the "Audio models" group is found.
    for section in (s for s in toctree if s.get('title') == 'API'):
        for subsection in section.get('sections', []):
            if subsection.get('title') != 'Models':
                continue
            for group in subsection.get('sections', []):
                if group.get('title') != 'Audio models':
                    continue
                names = []
                for entry in group.get('sections', []):
                    if 'local' not in entry:
                        continue
                    # "model_doc/wav2vec2-bert" -> "wav2vec2_bert"
                    doc_name = entry['local'].split('/')[-1]
                    names.append(doc_name.lower().replace('-', '_'))
                return names
    return []
def fetch_and_process_ci_results(job_id):
    """Collect per-audio-model CI test results for a transformers workflow run.

    Pages through the run's jobs via the GitHub API, keeps the "Model CI" jobs
    that match a known audio model, downloads each job's logs, parses the pytest
    summary line, and writes the aggregated counts to ``test_results_by_type.csv``.

    Args:
        job_id: The GitHub Actions workflow *run* id whose jobs are inspected.

    Raises:
        ValueError: If the ``GITHUB_TOKEN`` environment variable is unset.
    """
    github_token = os.environ.get('GITHUB_TOKEN')
    if not github_token:
        raise ValueError("GitHub token not found in environment variables")
    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github+json",
    }

    audio_models = get_audio_models()
    # NOTE(review): never read anywhere below — presumably models without a CI
    # job of their own; kept for reference, confirm before deleting.
    non_tested_models = [
        "xls_r",
        "speech_to_text_2",
        "mctct",
        "xlsr_wav2vec2",
        "mms",
    ]
    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{job_id}/jobs"
    # model name -> list of job ids observed for that model in this run
    audio_model_jobs = {audio_model: [] for audio_model in audio_models}

    def process_jobs(jobs_data):
        """Record the job ids of audio-model "Model CI" jobs from one API page."""
        for job in jobs_data['jobs']:
            if "Model CI" in job['name'] and "models" in job['name']:
                # Job names embed the test path, e.g. "... (models/wav2vec2) ...".
                match = re.search(r'models/([^/)]+)', job['name'])
                if match:
                    model_name = match.group(1).lower()
                    if model_name in audio_model_jobs:
                        audio_model_jobs[model_name].append(job['id'])

    async def fetch_and_process_jobs(session, page_url):
        """Fetch one jobs page, process it, and return the next page URL (or None)."""
        async with session.get(page_url, headers=headers) as response:
            jobs_data = await response.json()
            process_jobs(jobs_data)
            # aiohttp parses the Link header; 'next' is absent on the last page.
            return response.links.get('next', {}).get('url')

    async def fetch_all_jobs():
        """Follow the Link-header pagination until every jobs page is processed."""
        async with aiohttp.ClientSession() as session:
            next_url = url
            with tqdm(desc="Fetching jobs", unit="page") as pbar:
                while next_url:
                    next_url = await fetch_and_process_jobs(session, next_url)
                    pbar.update(1)

    def parse_test_results(text):
        """Extract failed/passed/skipped counts from a pytest summary line in *text*."""
        pattern = r'=+ (?:(\d+) failed,?\s*)?(?:(\d+) passed,?\s*)?(?:(\d+) skipped,?\s*)?(?:\d+ warnings?\s*)?in \d+\.\d+s'
        match = re.search(pattern, text)
        if match:
            # Each group is optional in the summary; absent means zero.
            failed = int(match.group(1)) if match.group(1) else 0
            passed = int(match.group(2)) if match.group(2) else 0
            skipped = int(match.group(3)) if match.group(3) else 0
            return {'failed': failed, 'passed': passed, 'skipped': skipped}
        raise Exception("Could not find test summary in logs")

    def retrieve_job_logs(gh_job_id, job_name):
        """Download one job's logs and return its parsed test summary dict.

        Renamed the parameter from ``job_id`` so it no longer shadows the
        enclosing function's run id.
        """
        job_url = f"https://api.github.com/repos/huggingface/transformers/actions/jobs/{gh_job_id}"
        job_response = requests.get(job_url, headers=headers)
        logs_url = f"https://api.github.com/repos/huggingface/transformers/actions/jobs/{gh_job_id}/logs"
        logs_response = requests.get(logs_url, headers=headers)
        test_summary = parse_test_results(logs_response.text)
        test_summary["model"] = job_name
        test_summary["conclusion"] = job_response.json()['conclusion']
        return test_summary

    # BUGFIX: the original fetched and processed the first page synchronously and
    # then ran fetch_all_jobs() starting from the same URL, so page 1 was
    # processed twice and every first-page job id was duplicated (doubling its
    # test counts after aggregation). Pagination now starts exactly once, inside
    # fetch_all_jobs().
    asyncio.run(fetch_all_jobs())

    # Download and parse logs for every recorded job.
    results = []
    for job_name, job_ids in tqdm(audio_model_jobs.items()):
        for one_job_id in job_ids:
            results.append(retrieve_job_logs(one_job_id, job_name))

    # Long format (model, conclusion, test_type, number_of_tests), summed per group.
    df = (pd.DataFrame(results)
          .melt(id_vars=['model', 'conclusion'],
                value_vars=['failed', 'passed', 'skipped'],
                var_name='test_type',
                value_name='number_of_tests')
          .groupby(['model', 'conclusion', 'test_type'])
          .agg({'number_of_tests': 'sum'})
          .reset_index())
    df.to_csv('test_results_by_type.csv', index=False)