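"""Scrape headline and body text from fact-checking articles.

Provides site-specific extractors for snopes.com and politifact.com, plus a
generic fallback for other domains.
"""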
from typing import Dict, Optional
import logging
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup

from ..utils.logging_config import setup_logging

class ArticleScraper:
    def __init__(self):
        self.session = requests.Session()
        # Merge into the session's default headers rather than replacing them,
        # so defaults such as Accept-Encoding are preserved
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        setup_logging()
        self.logger = logging.getLogger(__name__)

    def _get_domain(self, url: str) -> str:
        """Extract domain from URL."""
        return urlparse(url).netloc

    def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch page content with error handling."""
        try:
            # A timeout prevents a stalled server from hanging the scraper indefinitely
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {str(e)}")
            return None

    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from Snopes articles."""
        # Get headline from any h1 tag since it doesn't have a specific class
        headline_elem = soup.find('h1')
        headline = headline_elem.get_text().strip() if headline_elem else ''
        self.logger.info(f"Found headline: {headline}")
        
        # Try to find the article content
        article = soup.find('article')
        if article:
            self.logger.info("Found article tag")
            # Remove unwanted elements
            for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
                unwanted.decompose()
            
            # Get all paragraphs from the article
            paragraphs = article.find_all('p')
            if paragraphs:
                content = ' '.join(p.get_text().strip() for p in paragraphs)
            else:
                content = article.get_text().strip()
        else:
            self.logger.warning("No article tag found")
            content = ''
            
        return {"headline": headline, "content": content}

    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extract content from PolitiFact articles."""
        try:
            # Prefer the PolitiFact-specific title class, then fall back to any h1
            headline_elem = soup.find('h1', class_='article__title') or soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else "No headline found"
            
            self.logger.info(f"Found headline: {headline}")
            
            content_div = soup.find('article', class_='article')
            if content_div:
                # Remove unwanted elements
                for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                    unwanted.decompose()
                content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
            else:
                # Try alternative content selectors
                content_selectors = ['.article__text', '.m-textblock']
                content = ''
                for selector in content_selectors:
                    content_elem = soup.select_one(selector)
                    if content_elem:
                        content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
                        break
                    
            if not content:
                self.logger.warning("No content found in article")
                content = "No content found"
            
            return {"headline": headline, "content": content}
            
        except Exception as e:
            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
            return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

    def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
        """
        Main function to scrape fact-checking articles.
        Returns a dictionary with headline and content.
        """
        html_content = self._fetch_page(url)
        if not html_content:
            self.logger.error("Failed to fetch page content")
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        domain = self._get_domain(url)
        
        self.logger.info(f"Scraping article from domain: {domain}")

        # Select appropriate extractor based on domain
        if 'snopes.com' in domain:
            result = self._extract_snopes(soup)
            if not result['headline'] or not result['content']:
                self.logger.warning("Failed to extract content from Snopes article")
                self.logger.debug(f"HTML content: {html_content[:500]}...")
            return result
        elif 'politifact.com' in domain:
            return self._extract_politifact(soup)
        else:
            # Generic extraction fallback for unrecognized domains
            headline_elem = soup.find('h1')
            headline = headline_elem.get_text().strip() if headline_elem else ''
            
            # Try common content selectors
            content_selectors = ['article', 'main', '.content', '.article-content']
            content = ''
            
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove unwanted elements
                    for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
                        unwanted.decompose()
                    content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
                    break

            return {"headline": headline, "content": content}