File size: 4,721 Bytes
5923f2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import requests
import yaml
import os
import re
import asyncio
import aiohttp
import pandas as pd
from tqdm import tqdm

def get_audio_models():
    """Return the audio model names listed in the transformers documentation.

    Fetches the docs ``_toctree.yml`` from the transformers GitHub repo and
    walks API -> Models -> "Audio models", normalizing each entry's doc path
    into the module-style name used by CI job names (lowercase, '-' -> '_').

    Returns:
        list[str]: normalized model names; empty list when the fetch fails
        or the "Audio models" section cannot be found.
    """
    url = "https://raw.githubusercontent.com/huggingface/transformers/main/docs/source/en/_toctree.yml"
    # Timeout so a stalled connection cannot hang the whole script
    # (requests has no default timeout).
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Failed to fetch the YAML file")
        return []

    toctree_content = yaml.safe_load(response.text)

    # Guard clauses instead of four nested ifs: skip non-matching sections early.
    for section in toctree_content:
        if section.get('title') != 'API':
            continue
        for subsection in section.get('sections', []):
            if subsection.get('title') != 'Models':
                continue
            for model_section in subsection.get('sections', []):
                if model_section.get('title') == 'Audio models':
                    # 'local' looks like "model_doc/wav2vec2-bert"; keep the
                    # last path segment, normalized to snake_case.
                    return [
                        entry['local'].split('/')[-1].lower().replace('-', '_')
                        for entry in model_section.get('sections', [])
                        if 'local' in entry
                    ]

    return []

def fetch_and_process_ci_results(job_id):
    """Collect per-audio-model test results for a transformers CI workflow run.

    Args:
        job_id: GitHub Actions workflow *run* id whose jobs are inspected.

    Side effects:
        Writes 'test_results_by_type.csv' with one row per
        (model, conclusion, test_type) and the summed number of tests.

    Raises:
        ValueError: when GITHUB_TOKEN is not set in the environment.
        Exception: when a job's logs contain no pytest summary line.
    """
    github_token = os.environ.get('GITHUB_TOKEN')
    if not github_token:
        raise ValueError("GitHub token not found in environment variables")

    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github+json"
    }

    audio_models = get_audio_models()
    # NOTE(review): currently unused — presumably models documented but not
    # exercised by CI; kept for reference / future filtering. Confirm intent.
    non_tested_models = [
        "xls_r",
        "speech_to_text_2",
        "mctct",
        "xlsr_wav2vec2",
        "mms"
    ]

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{job_id}/jobs"

    # model name -> list of GitHub job ids belonging to that model.
    audio_model_jobs = {audio_model: [] for audio_model in audio_models}

    def process_jobs(jobs_data):
        """Record job ids of audio-model "Model CI" jobs from one API page."""
        for job in jobs_data['jobs']:
            if "Model CI" in job['name'] and "models" in job['name']:
                # Job names embed the test path, e.g. "... (models/wav2vec2)".
                match = re.search(r'models/([^/)]+)', job['name'])
                if match:
                    model_name = match.group(1).lower()
                    if model_name in audio_model_jobs:
                        audio_model_jobs[model_name].append(job['id'])

    async def fetch_and_process_jobs(session, page_url):
        """Fetch one jobs page, process it, return the next page URL or None."""
        async with session.get(page_url, headers=headers) as response:
            jobs_data = await response.json()
            process_jobs(jobs_data)
            # Follow RFC 5988 pagination via the Link header.
            return response.links.get('next', {}).get('url')

    async def fetch_all_jobs():
        """Walk every page of the jobs listing, following 'next' links."""
        async with aiohttp.ClientSession() as session:
            next_url = url
            with tqdm(desc="Fetching jobs", unit="page") as pbar:
                while next_url:
                    next_url = await fetch_and_process_jobs(session, next_url)
                    pbar.update(1)

    def parse_test_results(text):
        """Extract failed/passed/skipped counts from a pytest summary line.

        Raises when no summary line is found (e.g. job crashed before pytest
        printed its footer).
        """
        pattern = r'=+ (?:(\d+) failed,?\s*)?(?:(\d+) passed,?\s*)?(?:(\d+) skipped,?\s*)?(?:\d+ warnings?\s*)?in \d+\.\d+s'
        match = re.search(pattern, text)
        if match:
            # Each group is optional; a missing section means zero tests of it.
            failed = int(match.group(1)) if match.group(1) else 0
            passed = int(match.group(2)) if match.group(2) else 0
            skipped = int(match.group(3)) if match.group(3) else 0
            return {'failed': failed, 'passed': passed, 'skipped': skipped}
        raise Exception("Could not find test summary in logs")

    def retrieve_job_logs(gh_job_id, job_name):
        """Return the parsed test summary plus model name and job conclusion."""
        # Renamed locals: the original's `url`/`job_id` shadowed the enclosing
        # function's variables of the same names.
        job_url = f"https://api.github.com/repos/huggingface/transformers/actions/jobs/{gh_job_id}"
        response = requests.get(job_url, headers=headers, timeout=30)
        logs_url = f"https://api.github.com/repos/huggingface/transformers/actions/jobs/{gh_job_id}/logs"
        logs_response = requests.get(logs_url, headers=headers, timeout=30)
        test_summary = parse_test_results(logs_response.text)
        test_summary["model"] = job_name
        test_summary["conclusion"] = response.json()['conclusion']
        return test_summary

    # BUGFIX: the original fetched page 1 synchronously (requests.get +
    # process_jobs) and then fetched/processed the SAME first page again inside
    # fetch_all_jobs (which also starts from `url`), appending every
    # first-page job id twice and double-counting its tests in the final sums.
    # Pagination now happens in exactly one place.
    asyncio.run(fetch_all_jobs())

    # Retrieve each job's logs and build one result dict per job.
    results = []
    for job_name, job_ids in tqdm(audio_model_jobs.items()):
        for gh_job_id in job_ids:
            results.append(retrieve_job_logs(gh_job_id, job_name))

    # Long format: one row per (model, conclusion, test_type), counts summed
    # across a model's jobs, then saved to CSV.
    df = (pd.DataFrame(results)
                .melt(id_vars=['model', 'conclusion'],
                    value_vars=['failed', 'passed', 'skipped'],
                    var_name='test_type',
                    value_name='number_of_tests')
                .groupby(['model', 'conclusion', 'test_type'])
                .agg({'number_of_tests': 'sum'})
                .reset_index())

    df.to_csv('test_results_by_type.csv', index=False)