# File size: 4,681 Bytes
# c71b3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from urllib.parse import urljoin


# Column order of the exported table. Passing it to the DataFrame constructor
# keeps the schema stable and gives an empty scrape a proper CSV header row.
_PRODUCT_COLUMNS = (
    'Assessment Name',
    'URL',
    'Remote Testing Support',
    'Adaptive/IRT Support',
    'Duration',
    'Test Type',
)


def _build_driver():
    """Create the headless Chrome driver used for the scrape."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Optional: Run in background
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)


def _scroll_to_bottom(driver, pause=2, max_scrolls=100):
    """Scroll down until the page height stops growing.

    Returns True when the height stabilised, False when ``max_scrolls``
    scrolls were performed first. ``max_scrolls=None`` removes the cap.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give lazily loaded content time to be appended before re-measuring.
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return True
        last_height = new_height
        scrolls += 1
    return False


def _parse_card(card, base_url):
    """Extract one product record (a dict keyed by _PRODUCT_COLUMNS) from a card.

    Raises NoSuchElementException when the card has no title or no link, so
    the caller decides how to handle an unusable card.
    """
    product = {
        'Assessment Name': 'Not found',
        'URL': 'Not found',
        'Remote Testing Support': 'No',
        'Adaptive/IRT Support': 'No',
        'Duration': 'Not specified',
        'Test Type': 'Not specified'
    }

    # Name
    name_element = card.find_element(By.CSS_SELECTOR, ".product-card__title")
    product['Assessment Name'] = name_element.text

    # URL (urljoin leaves absolute hrefs untouched and resolves relative ones)
    link_element = card.find_element(By.CSS_SELECTOR, "a[href]")
    product['URL'] = urljoin(base_url, link_element.get_attribute("href"))

    # Metadata: each item is a label/value pair; the label decides the column.
    for item in card.find_elements(By.CSS_SELECTOR, ".product-card__meta-item"):
        try:
            label = item.find_element(By.CSS_SELECTOR, ".product-card__meta-label").text.lower()
            value = item.find_element(By.CSS_SELECTOR, ".product-card__meta-value").text
        except NoSuchElementException:
            # An item missing its label or value carries no usable data.
            continue

        if 'remote' in label:
            product['Remote Testing Support'] = 'Yes' if 'yes' in value.lower() else 'No'
        elif 'adaptive' in label or 'irt' in label:
            product['Adaptive/IRT Support'] = 'Yes' if 'yes' in value.lower() else 'No'
        elif 'duration' in label:
            product['Duration'] = value
        elif 'type' in label:
            product['Test Type'] = value

    return product


def scrape_shl_products(*, output_path='shl_products.csv', max_scrolls=100) -> "pd.DataFrame | None":
    """Scrape the SHL product catalog page and export the result as CSV.

    Loads the catalog in headless Chrome, scrolls to trigger lazy loading,
    parses every ``.product-card`` element and writes the records to
    ``output_path``.

    Args:
        output_path: Destination of the CSV file.
        max_scrolls: Upper bound on scroll iterations, so a page whose height
            keeps growing cannot stall the scraper forever. ``None`` disables
            the bound.

    Returns:
        A DataFrame with the columns in ``_PRODUCT_COLUMNS`` (possibly empty),
        or ``None`` when the page timed out or another error occurred; the
        error is printed rather than raised.
    """
    driver = _build_driver()

    base_url = "https://www.shl.com"
    catalog_url = "https://www.shl.com/solutions/products/product-catalog/"

    try:
        print("Loading SHL product catalog...")
        driver.get(catalog_url)

        # Wait for products to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
        )

        # Scroll to load all products
        print("Scrolling to load all products...")
        if not _scroll_to_bottom(driver, max_scrolls=max_scrolls):
            print(f"Stopped scrolling after {max_scrolls} scrolls; the page may not be fully loaded.")

        product_cards = driver.find_elements(By.CSS_SELECTOR, ".product-card")
        print(f"Found {len(product_cards)} products.")

        products = []
        for card in product_cards:
            try:
                products.append(_parse_card(card, base_url))
            except Exception as e:
                # Best effort: one broken card must not abort the whole scrape.
                print(f"Error processing a product card: {e}")

        # Save data; explicit columns keep the header even when nothing was parsed.
        df = pd.DataFrame(products, columns=list(_PRODUCT_COLUMNS))
        df.to_csv(output_path, index=False)
        print(f"Data saved to {output_path}")

        return df

    except TimeoutException:
        print("Timeout loading the page.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    finally:
        # Always release the browser, on success and on failure alike.
        driver.quit()
        print("Browser closed.")


if __name__ == "__main__":
    # Script entry point: run the scraper, then preview whatever came back.
    print("Starting SHL scraper...")   # Debug print
    df = scrape_shl_products()
    # The scraper yields None on failure and may yield an empty table.
    if df is None or df.empty:
        print("No data scraped.")
    else:
        print("\nFirst 5 results:")
        print(df.head())