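"""Scraper for fact-checking articles.

Provides ArticleScraper, which extracts a headline and body text from
Snopes and PolitiFact pages, with a generic fallback for other domains.
"""
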
import logging
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from ..utils.logging_config import setup_logging
class ArticleScraper:
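    """Scrapes headline and body text from fact-checking article pages."""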

    def __init__(self):
        self.session = requests.Session()
        # Update the session's default headers rather than replacing them,
        # so requests' own defaults (Accept-Encoding, etc.) are preserved.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A timeout keeps a stalled server from hanging the scraper.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
"""Extract content from Snopes articles."""
# Get headline from any h1 tag since it doesn't have a specific class
headline_elem = soup.find('h1')
headline = headline_elem.get_text().strip() if headline_elem else ''
self.logger.info(f"Found headline: {headline}")
# Try to find the article content
article = soup.find('article')
if article:
self.logger.info("Found article tag")
# Remove unwanted elements
for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
unwanted.decompose()
# Get all paragraphs from the article
paragraphs = article.find_all('p')
if paragraphs:
content = ' '.join(p.get_text().strip() for p in paragraphs)
else:
content = article.get_text().strip()
else:
self.logger.warning("No article tag found")
content = ''
return {"headline": headline, "content": content}

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            # Prefer PolitiFact's title class, then fall back to any h1.
            headline_elem = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else "No headline found"
self.logger.info(f"Found headline: {headline}")
content_div = soup.find('article', class_='article')
if content_div:
# Remove unwanted elements
for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
unwanted.decompose()
content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
else:
# Try alternative content selectors
content_selectors = ['.article__text', '.m-textblock']
content = ''
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
break
if not content:
self.logger.warning("No content found in article")
content = "No content found"
return {"headline": headline, "content": content}
except Exception as e:
self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Scrape a fact-checking article from a URL.

        Returns a dictionary with the headline and content, or None if
        the page could not be fetched.
        """
html_content = self._fetch_page(url)
if not html_content:
self.logger.error("Failed to fetch page content")
return None
soup = BeautifulSoup(html_content, 'html.parser')
domain = self._get_domain(url)
self.logger.info(f"Scraping article from domain: {domain}")
# Select appropriate extractor based on domain
if 'snopes.com' in domain:
result = self._extract_snopes(soup)
if not result['headline'] or not result['content']:
self.logger.warning("Failed to extract content from Snopes article")
self.logger.debug(f"HTML content: {html_content[:500]}...")
return result
elif 'politifact.com' in domain:
return self._extract_politifact(soup)
else:
            # Generic extraction fallback
            headline_elem = soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else ''
# Try common content selectors
content_selectors = ['article', 'main', '.content', '.article-content']
content = ''
for selector in content_selectors:
content_div = soup.select_one(selector)
if content_div:
# Remove unwanted elements
for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
unwanted.decompose()
content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
break
return {"headline": headline, "content": content}