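"""Scraper for fact-checking articles.

Provides ArticleScraper, which extracts a headline and body text from
Snopes and PolitiFact pages, with a generic fallback for other domains.
"""
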
import logging
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from ..utils.logging_config import setup_logging
class ArticleScraper:
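    """Scrapes headline and body text from fact-checking article pages."""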

    def __init__(self):
        self.session = requests.Session()
        # Update the session's default headers rather than replacing them,
        # so requests' own defaults (Accept-Encoding, etc.) are preserved.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A timeout keeps a stalled server from hanging the scraper.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
"""Extract content from Snopes articles."""
# Get headline from any h1 tag since it doesn't have a specific class
headline_elem = soup.find('h1')
headline = headline_elem.get_text().strip() if headline_elem else ''
self.logger.info(f"Found headline: {headline}")
# Try to find the article content
article = soup.find('article')
if article:
self.logger.info("Found article tag")
# Remove unwanted elements
for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
unwanted.decompose()
# Get all paragraphs from the article
paragraphs = article.find_all('p')
if paragraphs:
content = ' '.join(p.get_text().strip() for p in paragraphs)
else:
content = article.get_text().strip()
else:
self.logger.warning("No article tag found")
content = ''
return {"headline": headline, "content": content}

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            # Prefer PolitiFact's title class, then fall back to any h1.
            headline_elem = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else "No headline found"
self.logger.info(f"Found headline: {headline}")
content_div = soup.find('article', class_='article')
if content_div:
# Remove unwanted elements
for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
unwanted.decompose()
content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
else:
# Try alternative content selectors
content_selectors = ['.article__text', '.m-textblock']
content = ''
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
break
if not content:
self.logger.warning("No content found in article")
content = "No content found"
return {"headline": headline, "content": content}
except Exception as e:
self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Scrape a fact-checking article from a URL.

        Returns a dictionary with the headline and content, or None if
        the page could not be fetched.
        """
html_content = self._fetch_page(url)
if not html_content:
self.logger.error("Failed to fetch page content")
return None
soup = BeautifulSoup(html_content, 'html.parser')
domain = self._get_domain(url)
self.logger.info(f"Scraping article from domain: {domain}")
# Select appropriate extractor based on domain
if 'snopes.com' in domain:
result = self._extract_snopes(soup)
if not result['headline'] or not result['content']:
self.logger.warning("Failed to extract content from Snopes article")
self.logger.debug(f"HTML content: {html_content[:500]}...")
return result
elif 'politifact.com' in domain:
return self._extract_politifact(soup)
else:
            # Generic extraction fallback
            headline_elem = soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else ''
# Try common content selectors
content_selectors = ['article', 'main', '.content', '.article-content']
content = ''
for selector in content_selectors:
content_div = soup.select_one(selector)
if content_div:
# Remove unwanted elements
for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
unwanted.decompose()
content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
break
return {"headline": headline, "content": content}