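"""Scraper for fact-checking articles from Snopes and PolitiFact, with a generic fallback."""
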
from typing import Dict, Optional
import logging
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from ..utils.logging_config import setup_logging


class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        # Merge into the session's default headers rather than replacing
        # them wholesale, so defaults like Accept-Encoding are preserved.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A finite timeout is a defensive default; without one the
            # request can hang indefinitely on an unresponsive host.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
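
    # Optional robustness sketch, not part of the original class: mount
    # urllib3 retries on the session so transient failures (429/5xx) are
    # retried with exponential backoff. The method name and defaults are
    # assumptions; call it once after constructing the scraper if wanted.
    def enable_retries(self, total: int = 3, backoff_factor: float = 0.5) -> None:
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry

        retry = Retry(
            total=total,
            backoff_factor=backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry)
        # Apply to both schemes handled by this session.
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)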

    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from Snopes articles."""
        # Get headline from any h1 tag since it doesn't have a specific class
        headline_elem = soup.find('h1')
        headline = headline_elem.get_text().strip() if headline_elem else ''
        self.logger.info(f"Found headline: {headline}")

        # Try to find the article content
        article = soup.find('article')
        if article:
            self.logger.info("Found article tag")
            # Remove unwanted elements
            for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
                unwanted.decompose()
            # Get all paragraphs from the article
            paragraphs = article.find_all('p')
            if paragraphs:
                content = ' '.join(p.get_text().strip() for p in paragraphs)
            else:
                content = article.get_text().strip()
        else:
            self.logger.warning("No article tag found")
            content = ''

        return {"headline": headline, "content": content}

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            # Prefer the site-specific headline class, falling back to any h1
            headline_elem = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else "No headline found"
            self.logger.info(f"Found headline: {headline}")

            content_div = soup.find('article', class_='article')
            if content_div:
                # Remove unwanted elements
                for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                    unwanted.decompose()
                content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
            else:
                # Try alternative content selectors
                content_selectors = ['.article__text', '.m-textblock']
                content = ''
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
                        break

            if not content:
                self.logger.warning("No content found in article")
                content = "No content found"

            return {"headline": headline, "content": content}
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {e}")
            return {"headline": "Error", "content": f"Failed to extract content: {e}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main function to scrape fact-checking articles.
        Returns a dictionary with headline and content.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        self.logger.info(f"Scraping article from domain: {domain}")

        # Select appropriate extractor based on domain
        if 'snopes.com' in domain:
            result = self._extract_snopes(soup)
            if not result['headline'] or not result['content']:
                self.logger.warning("Failed to extract content from Snopes article")
                self.logger.debug(f"HTML content: {html_content[:500]}...")
            return result
        elif 'politifact.com' in domain:
            return self._extract_politifact(soup)
        else:
            # Generic extraction fallback
            headline_elem = soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else ''
            # Try common content selectors
            content_selectors = ['article', 'main', '.content', '.article-content']
            content = ''
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove unwanted elements
                    for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                        unwanted.decompose()
                    content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
                    break
            return {"headline": headline, "content": content}