"""Dataframe builder for the Data Contamination Report.

Favicon, model and dataset URLs are memoised in local JSON caches that can
optionally be synced with the ONLINE_CACHE dataset repo on the Hugging Face Hub.
"""

import json
import os

import filelock
import huggingface_hub
import pandas as pd

from utils import (
    build_datasets_urls,
    build_models_urls,
    build_text_icon,
    download_favicons,
    get_base_url,
    get_domain_name,
)

HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"

# Set DISABLE_ONLINE_CACHE to True to skip syncing the JSON caches with the Hub
DISABLE_ONLINE_CACHE = False
ONLINE_CACHE = "CONDA-Workshop/RequestCache"


def save_cache(cache_data, cache_file, initial_timestamp):
    """Merge cache_data with any concurrent on-disk updates, write it to cache_file
    and, unless DISABLE_ONLINE_CACHE is set, upload it to the online cache repo."""
    print(f"Saving cache to {cache_file}")
    with filelock.FileLock(f"{cache_file}.lock"):
        # If the file changed on disk since we first read it, merge the on-disk
        # entries into ours before overwriting, so concurrent writes are not lost.
        current_timestamp = (
            os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
        )
        if current_timestamp is None or initial_timestamp != current_timestamp:
            try:
                with open(cache_file, "r", encoding="utf8") as f:
                    cache_dict = json.load(f)
                    if cache_dict != cache_data:
                        cache_data.update(cache_dict)
            except FileNotFoundError:
                pass

        with open(cache_file, "w", encoding="utf8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=4)

        if not DISABLE_ONLINE_CACHE:
            try:
                huggingface_hub.upload_file(
                    repo_id=ONLINE_CACHE,
                    repo_type="dataset",
                    token=os.environ.get("TOKEN") or True,
                    path_in_repo=cache_file,
                    path_or_fileobj=cache_file,
                )
            except Exception as e:
                print(f"Unable to upload {cache_file}: {e}")

    return cache_data


def update_favicon_cache(sources):
    """Return a {base_url: favicon_url} dict, downloading favicons for unseen domains."""
    favicon_dict = {}
    favicon_file_path = "favicons.json"
    initial_timestamp = None

    # Try to fetch the latest cache file from the online cache repo
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=favicon_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download favicons.json: {e}")

    # Load the local cache if it exists and remember its timestamp
    if os.path.exists(favicon_file_path):
        initial_timestamp = os.path.getmtime(favicon_file_path)
        try:
            with open(favicon_file_path, "r", encoding="utf8") as f:
                favicon_dict = json.load(f)
        except FileNotFoundError:
            pass

    # Download favicons only for domains that are not cached yet
    missing_domains = [domain for domain in sources if domain not in favicon_dict]
    if missing_domains:
        new_favicon_urls = download_favicons(missing_domains)
        favicon_dict.update(new_favicon_urls)
        favicon_dict = save_cache(
            cache_data=favicon_dict,
            cache_file=favicon_file_path,
            initial_timestamp=initial_timestamp,
        )

    return favicon_dict


def update_model_url_cache(models):
    """Return a {model_name: url} dict, resolving URLs for models not cached yet."""
    models = [x for x in models if x is not None]
    models = list(set(models))

    model_url_dict = {}
    model_url_file_path = "model_urls.json"
    initial_timestamp = None

    # Try to fetch the latest cache file from the online cache repo
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=model_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download model_urls.json: {e}")

    # Load the local cache if it exists and remember its timestamp
    if os.path.exists(model_url_file_path):
        initial_timestamp = os.path.getmtime(model_url_file_path)
        try:
            with open(model_url_file_path, "r", encoding="utf8") as f:
                model_url_dict = json.load(f)
        except FileNotFoundError:
            pass

    # Resolve URLs only for models that are not cached yet
    missing_model_urls = [model for model in models if model not in model_url_dict]
    if missing_model_urls:
        new_model_urls = build_models_urls(missing_model_urls)
        model_url_dict.update(new_model_urls)
        model_url_dict = save_cache(
            cache_data=model_url_dict,
            cache_file=model_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return model_url_dict


def update_dataset_url_cache(datasets):
    """Return a {dataset_name: url} dict, resolving URLs for datasets not cached yet."""
    datasets = [x for x in datasets if x is not None]
    datasets = list(set(datasets))

    dataset_url_dict = {}
    dataset_url_file_path = "dataset_urls.json"
    initial_timestamp = None

    # Try to fetch the latest cache file from the online cache repo
    if not DISABLE_ONLINE_CACHE:
        try:
            huggingface_hub.hf_hub_download(
                repo_id=ONLINE_CACHE,
                repo_type="dataset",
                token=os.environ.get("TOKEN") or True,
                filename=dataset_url_file_path,
                local_dir=os.getcwd(),
            )
        except Exception as e:
            print(f"Unable to download dataset_urls.json: {e}")

    # Load the local cache if it exists and remember its timestamp
    if os.path.exists(dataset_url_file_path):
        initial_timestamp = os.path.getmtime(dataset_url_file_path)
        try:
            with open(dataset_url_file_path, "r", encoding="utf8") as f:
                dataset_url_dict = json.load(f)
        except FileNotFoundError:
            pass

    # Resolve URLs only for datasets that are not cached yet
    missing_dataset_urls = [
        dataset for dataset in datasets if dataset not in dataset_url_dict
    ]
    if missing_dataset_urls:
        new_dataset_urls = build_datasets_urls(missing_dataset_urls)
        dataset_url_dict.update(new_dataset_urls)
        dataset_url_dict = save_cache(
            cache_data=dataset_url_dict,
            cache_file=dataset_url_file_path,
            initial_timestamp=initial_timestamp,
        )

    return dataset_url_dict


def get_dataframe():
    """Load contamination_report.csv and render it with links and icons for display."""
    data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)

    # Refresh the URL/favicon caches for every source referenced in the report
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])

    model_url_dict = update_model_url_cache(
        data[data["Model or corpus"] == "model"]["Contaminated Source"]
    )

    dataset_url_dict = update_dataset_url_cache(
        list(data["Evaluation Dataset"])
        + list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
    )

    # Render each reference as "domain name + favicon" linking to the source
    data["Reference"] = data["Reference"].apply(
        lambda x: build_text_icon(
            text=get_domain_name(x),
            url=x,
            icon_url=favicon_dict.get(get_base_url(x), ""),
        )
    )

    # Link each entry to its discussion PR when a PR number is available
    PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"
    data["PR"] = data["PR"].apply(
        lambda x: build_text_icon(
            text="",
            url=PR_URL_FORMAT.format(int(x)) if pd.notna(x) else "no link",
            icon_url=HF_ICON if pd.notna(x) else CROSS_ICON,
        )
    )

    data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
        lambda x: build_text_icon(
            text=x,
            url=dataset_url_dict.get(x, ""),
            icon_url=HF_ICON,
        )
    )

    # Append the subset name to the evaluation dataset, then drop the column
    data["Evaluation Dataset"] = data.apply(
        lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})"
        if pd.notna(x["Subset"])
        else x["Evaluation Dataset"],
        axis=1,
    )
    del data["Subset"]

    # Link the contaminated source to a dataset or model URL depending on its type
    data["Contaminated Source"] = data.apply(
        lambda x: build_text_icon(
            text=x["Contaminated Source"],
            url=dataset_url_dict.get(x["Contaminated Source"], "")
            if x["Model or corpus"] == "corpus"
            else model_url_dict.get(x["Contaminated Source"], ""),
            icon_url=HF_ICON,
        ),
        axis=1,
    )

    # Convert the split percentages (0-100) to fractions (0-1) for display
    data["Train Split"] = data["Train Split"].apply(lambda x: x / 100 if x else x)
    data["Development Split"] = data["Development Split"].apply(lambda x: x / 100 if x else x)
    data["Test Split"] = data["Test Split"].apply(lambda x: x / 100 if x else x)

    return data
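

# A minimal usage sketch (an addition, not part of the original module): when the
# file is run directly, build the report dataframe and print a preview. The app
# that displays the report is assumed to import get_dataframe() instead.
if __name__ == "__main__":
    df = get_dataframe()
    print(df.head())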