| import logging |
| import re |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
| from typing import Dict, List, Union |
| from urllib.parse import urljoin, urlparse |
|
|
| import requests |
| from bs4 import BeautifulSoup |
|
|
|
|
def get_base_url(url: str) -> str:
    """
    Return the scheme-and-host portion of a URL.

    Parameters:
    - url (str): The full URL to strip down.

    Returns:
    - str: "<scheme>://<netloc>" with path, query and fragment removed.
    """
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"
|
|
|
|
def get_domain_name(url: str) -> str:
    """
    Extract a human-readable site name from a URL.

    Strips a leading "www." prefix and the trailing TLD segment, then
    capitalizes the result, e.g. "https://www.example.com" -> "Example".

    Args:
        url (str): The URL.

    Returns:
        str: The capitalized domain name, without "www." or the TLD.
    """
    netloc = urlparse(url).netloc
    if netloc.startswith("www."):
        netloc = netloc[4:]

    # Drop the TLD (everything after the last dot). If the host contains
    # no dot at all (e.g. "localhost"), keep it as-is instead of
    # collapsing to an empty string as the previous slice-and-join did.
    if "." in netloc:
        netloc = ".".join(netloc.split(".")[:-1])

    return netloc.capitalize()
|
|
|
|
def get_favicon(url: str) -> str:
    """
    Fetch the page at *url* and return the URL of its favicon.

    Looks for <link rel="icon"/"shortcut icon"/"apple-touch-icon"> tags and
    <meta> tags whose content ends in ".ico". Relative icon paths are
    resolved against *url*. On any failure (timeout, non-200 status,
    parse error, no icon found) a generic placeholder icon URL is returned.

    Parameters:
    - url (str): The page URL to inspect.

    Returns:
    - str: An absolute favicon URL, or the placeholder URL on failure.
    """
    default_icon = (
        "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
    )
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=2)
        if response.status_code != 200:
            return default_icon

        soup = BeautifulSoup(response.content, "html.parser")
        icon_links = soup.find_all(
            "link", rel=re.compile(r"(shortcut icon|icon|apple-touch-icon)", re.I)
        )
        meta_icons = soup.find_all(
            "meta", attrs={"content": re.compile(r".ico$", re.I)}
        )

        for icon in icon_links + meta_icons:
            favicon_url = icon.get("href") or icon.get("content")
            if favicon_url:
                # urljoin resolves root-relative ("/favicon.ico") AND
                # page-relative ("favicon.ico") paths, and leaves absolute
                # URLs untouched. The previous code only handled paths
                # starting with "/", returning other relative paths broken.
                return urljoin(url, favicon_url)

        return default_icon
    except requests.Timeout:
        logging.warning(f"Request timed out for {url}")
        return default_icon
    except Exception as e:
        logging.warning(f"An error occurred while fetching favicon for {url}: {e}")
        return default_icon
|
|
|
|
def download_favicons(urls: List[str]) -> Dict[str, str]:
    """
    Fetch favicons for a collection of page URLs concurrently.

    Duplicate URLs are fetched only once; any URL whose lookup raises is
    mapped to a generic placeholder icon instead.

    Parameters:
    - urls (List[str]): Page URLs to resolve favicons for.

    Returns:
    - Dict[str, str]: Mapping of each unique input URL to a favicon URL.
    """
    unique_urls = list(set(urls))
    results: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = {pool.submit(get_favicon, page): page for page in unique_urls}
        for done in as_completed(pending):
            page = pending[done]
            try:
                results[page] = done.result()
            except Exception as e:
                logging.warning(f"Failed to fetch favicon for {page}: {e}")
                results[page] = (
                    "https://upload.wikimedia.org/wikipedia/commons/0/01/Website_icon.svg"
                )
    return results
|
|
|
|
def url_exists(url: str) -> bool:
    """
    Checks if a URL exists by making a HEAD request.

    Parameters:
    - url (str): The URL to check.

    Returns:
    - bool: True if the URL responds with a non-error status, False otherwise.
    """
    try:
        # A timeout keeps the check from hanging indefinitely on an
        # unresponsive host (the original request had no timeout at all).
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        # Connection errors, timeouts, invalid URLs, etc.
        return False
|
|
|
|
def build_dataset_url(dataset_name: str):
    """
    Return the Hugging Face Hub URL for *dataset_name*.

    Returns None when the dataset page does not respond successfully.
    """
    candidate = f"https://huggingface.co/datasets/{dataset_name}"
    return candidate if url_exists(candidate) else None
|
|
|
|
def build_model_url(model_name: str):
    """
    Return the Hugging Face Hub URL for *model_name*.

    Returns None when the model page does not respond successfully.
    """
    candidate = f"https://huggingface.co/{model_name}"
    return candidate if url_exists(candidate) else None
|
|
|
|
def build_text_icon(text: str, url: Union[str, None], icon_url: str):
    """
    Render *text* as an HTML link preceded by a small 16x16 icon.

    When *url* is None the plain text is returned unchanged.
    NOTE: neither *text* nor *url* is HTML-escaped; callers must pass
    trusted values.
    """
    if url is None:
        return text

    anchor_open = f'<a href="{url}" target="_blank" style="text-decoration: none; color: inherit; display: inline-flex; align-items: center;">'
    icon_img = f'<img src="{icon_url}" alt="{url}" style="display: inline-block; vertical-align: middle; margin-right: 4px;" width="16" height="16">'
    label = f'<span style="display: inline-block; vertical-align: middle;">{text}</span> </a>'
    return anchor_open + icon_img + label
|
|
|
|
def build_datasets_urls(datasets_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of dataset URLs from a list of dataset names.

    Parameters:
    - datasets_names (List[str]): The list of dataset names.

    Returns:
    - Dict[str, str]: Maps each name to its URL (or None when not found).
    """
    urls: Dict[str, str] = {}
    for name in datasets_names:
        urls[name] = build_dataset_url(name)
    return urls
|
|
|
|
def build_models_urls(models_names: List[str]) -> Dict[str, str]:
    """
    Build a dictionary of model URLs from a list of model names.

    Parameters:
    - models_names (List[str]): The list of model names.

    Returns:
    - Dict[str, str]: Maps each name to its URL (or None when not found).
    """
    urls: Dict[str, str] = {}
    for name in models_names:
        urls[name] = build_model_url(name)
    return urls
|
|