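"""Scraper for fact-checking articles from Snopes and PolitiFact."""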
from typing import Dict, Optional
import logging
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from ..utils.logging_config import setup_logging


class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        # Update (rather than replace) the session headers with a browser
        # User-Agent, so sites don't block the default requests client.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract the domain from a URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A timeout keeps an unresponsive host from hanging the scrape.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from Snopes articles."""
        # The headline has no distinctive class, so take the first h1
        headline_elem = soup.find('h1')
        headline = headline_elem.get_text().strip() if headline_elem else ''
        self.logger.info(f"Found headline: {headline}")

        # Try to find the article content
        article = soup.find('article')
        if article:
            self.logger.info("Found article tag")
            # Remove unwanted elements
            for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
                unwanted.decompose()
            # Join paragraph text; fall back to the article's full text
            paragraphs = article.find_all('p')
            if paragraphs:
                content = ' '.join(p.get_text().strip() for p in paragraphs)
            else:
                content = article.get_text().strip()
        else:
            self.logger.warning("No article tag found")
            content = ''

        return {"headline": headline, "content": content}

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            # Prefer the titled headline; fall back to the first h1
            headline_elem = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else "No headline found"
            self.logger.info(f"Found headline: {headline}")

            content_div = soup.find('article', class_='article')
            if content_div:
                # Remove unwanted elements
                for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                    unwanted.decompose()
                content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
            else:
                # Try alternative content selectors
                content_selectors = ['.article__text', '.m-textblock']
                content = ''
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
                        break

            if not content:
                self.logger.warning("No content found in article")
                content = "No content found"

            return {"headline": headline, "content": content}
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {e}")
            return {"headline": "Error", "content": f"Failed to extract content: {e}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main entry point for scraping fact-checking articles.
        Returns a dictionary with headline and content, or None on fetch failure.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        self.logger.info(f"Scraping article from domain: {domain}")

        # Select the appropriate extractor based on domain
        if 'snopes.com' in domain:
            result = self._extract_snopes(soup)
            if not result['headline'] or not result['content']:
                self.logger.warning("Failed to extract content from Snopes article")
                self.logger.debug(f"HTML content: {html_content[:500]}...")
            return result
        elif 'politifact.com' in domain:
            return self._extract_politifact(soup)
        else:
            # Generic extraction fallback for unrecognized domains
            headline_elem = soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else ''
            # Try common content selectors
            content_selectors = ['article', 'main', '.content', '.article-content']
            content = ''
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove unwanted elements
                    for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                        unwanted.decompose()
                    content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
                    break
            return {"headline": headline, "content": content}