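"""Scraper for fact-checking articles from Snopes and PolitiFact, with a generic fallback."""
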
from typing import Dict, Optional
import logging
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from ..utils.logging_config import setup_logging


class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        # Merge into the session's default headers rather than replacing
        # them wholesale, so defaults like Accept-Encoding are preserved.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A finite timeout is a defensive default; without one the
            # request can hang indefinitely on an unresponsive host.
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
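
    # Optional robustness sketch, not part of the original class: mount
    # urllib3 retries on the session so transient failures (429/5xx) are
    # retried with exponential backoff. The method name and defaults are
    # assumptions; call it once after constructing the scraper if wanted.
    def enable_retries(self, total: int = 3, backoff_factor: float = 0.5) -> None:
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry

        retry = Retry(
            total=total,
            backoff_factor=backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry)
        # Apply to both schemes handled by this session.
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)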

    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from Snopes articles."""
        # Get headline from any h1 tag since it doesn't have a specific class
        headline_elem = soup.find('h1')
        headline = headline_elem.get_text().strip() if headline_elem else ''
        self.logger.info(f"Found headline: {headline}")

        # Try to find the article content
        article = soup.find('article')
        if article:
            self.logger.info("Found article tag")
            # Remove unwanted elements
            for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
                unwanted.decompose()
            # Get all paragraphs from the article
            paragraphs = article.find_all('p')
            if paragraphs:
                content = ' '.join(p.get_text().strip() for p in paragraphs)
            else:
                content = article.get_text().strip()
        else:
            self.logger.warning("No article tag found")
            content = ''

        return {"headline": headline, "content": content}

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            # Prefer the site-specific headline class, falling back to any h1
            headline_elem = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else "No headline found"
            self.logger.info(f"Found headline: {headline}")

            content_div = soup.find('article', class_='article')
            if content_div:
                # Remove unwanted elements
                for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                    unwanted.decompose()
                content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
            else:
                # Try alternative content selectors
                content_selectors = ['.article__text', '.m-textblock']
                content = ''
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
                        break

            if not content:
                self.logger.warning("No content found in article")
                content = "No content found"

            return {"headline": headline, "content": content}
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {e}")
            return {"headline": "Error", "content": f"Failed to extract content: {e}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main function to scrape fact-checking articles.
        Returns a dictionary with headline and content.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        self.logger.info(f"Scraping article from domain: {domain}")

        # Select appropriate extractor based on domain
        if 'snopes.com' in domain:
            result = self._extract_snopes(soup)
            if not result['headline'] or not result['content']:
                self.logger.warning("Failed to extract content from Snopes article")
                self.logger.debug(f"HTML content: {html_content[:500]}...")
            return result
        elif 'politifact.com' in domain:
            return self._extract_politifact(soup)
        else:
            # Generic extraction fallback
            headline_elem = soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else ''
            # Try common content selectors
            content_selectors = ['article', 'main', '.content', '.article-content']
            content = ''
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove unwanted elements
                    for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                        unwanted.decompose()
                    content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
                    break
            return {"headline": headline, "content": content}